作业①:
实验内容
题目:
- 要求:在中国气象网(http://www.weather.com.cn)爬取给定城市集的 7 日天气预报,并保存在数据库中。
代码
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    """SQLite-backed store for city weather forecasts (file: weathers.db)."""

    def openDB(self):
        """Open (or create) weathers.db and ensure the ``weathers`` table exists.

        If the table already exists from a previous run, its rows are cleared
        so each run starts from a fresh data set.
        """
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers ("
                "wCity varchar(16),"
                "wDate varchar(16),"
                "wWeather varchar(64),"
                "wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))"
            )
        except sqlite3.OperationalError:
            # Table already exists — clear stale rows instead of recreating.
            # (Narrowed from a bare except:, which also hid real errors.)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending changes and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row.

        Duplicate (city, date) primary-key violations are printed rather
        than raised, so one bad row does not abort the whole scrape.
        """
        try:
            self.cursor.execute(
                "insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                (city, date, weather, temp),
            )
        except Exception as err:
            print(err)

    def show(self):
        """Print every stored row in aligned columns."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))
class WeatherForecast:
    """Scrape 7-day forecasts from www.weather.com.cn for a fixed city set."""

    def __init__(self):
        # Desktop browser UA so the site serves the normal page to the scraper.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50"}
        # City name -> weather.com.cn numeric city code (used in the page URL).
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601", "海淀": "101010200"}

    def forecastCity(self, city):
        """Fetch, parse and store the 7-day forecast for one city.

        Requires ``self.db`` (set by process()) to be an open WeatherDB.
        Unknown cities are reported and skipped.
        """
        if city not in self.cityCode:
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # Close the HTTP response deterministically (original leaked it).
            with urllib.request.urlopen(req) as resp:
                data = resp.read()
            # The page encoding varies; let UnicodeDammit pick utf-8 or gbk.
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    try:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    except IndexError:
                        # "Today" sometimes shows a single temperature,
                        # with no separate high/low pair.
                        temp = li.select('p[class="tem"] i')[0].text
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Scrape every city in *cities*, then print and close the database."""
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.show()
        self.db.closeDB()
# Entry point: scrape the five configured cities into weathers.db,
# print the collected rows, then report completion.
ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳", "海淀"])
print("completed")
运行结果:

心得体会
学习了把数据保存在数据库中的有关操作。
作业②:
实验内容
题目:
- 要求:用 requests 和 BeautifulSoup 库方法定向爬取股票相关信息,并存储在数据库中。
- 候选网站:东方财富网:https://www.eastmoney.com/ ;新浪股票:http://finance.sina.com.cn/stock/
代码
import requests
import re
import pandas as pd
# Query-string fragments for Eastmoney's stock-list API, keyed by board name.
# "fs" filters which market/board the endpoint returns; "fid" is the sort
# field. NOTE(review): per the write-up below, fid=f26 boards additionally
# expose the listing date in field f26 — confirm against the live API.
fid_fs = {
"沪深京A股": "fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
"上证A股": "fid=f3&fs=m:1+t:2,m:1+t:23",
"深证A股": "fid=f3&fs=m:0+t:6,m:0+t:80",
"北证A股": "fid=f3&fs=m:0+t:81+s:2048",
"新股": "fid=f26&fs=m:0+f:8,m:1+f:8",
"创业版": "fid=f3&fs=m:0+t:80",
"科创版": "fid=f3&fs=m:1+t:23",
"沪股通": "fid=f26&fs=b:BK0707",
"深股通": "fid=f26&fs=b:BK0804",
"B股": "fid=f3&fs=m:0+t:7,m:1+t:3",
"风险警示版": "fid=f3&fs=m:0+f:4,m:1+f:4",
"两网及退市": "fid=f3&fs=m:0+s:3"
}
def get_stock(key):
    """Scrape the Eastmoney stock list for board *key* and save it as one
    sheet of the module-level ExcelWriter ``xlsx``.

    Prompts interactively for the number of pages to fetch (20 rows/page)
    and stops early when the API returns no more data.
    """
    # Request f26 (listing date) only for boards sorted by it.
    # (Replaces the original no-op "(lambda x: x)(...)" wrapper.)
    fields = "f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f14,f15,f16,f17,f18,f23"
    if fid_fs[key][5:7] == "26":
        fields += ",f26"
    stocks = []
    # Keep asking until the user types a valid integer page count.
    while True:
        try:
            page = int(input("请输入要爬取 {} 的页数:".format(key)))
            break
        except ValueError:
            print("wrong input!")
    # Output-column order of the f-fields inside one record; the extra f26
    # field (index 16), when present, goes last. Hoisted out of the loop.
    order = [9, 10, 0, 1, 2, 3, 4, 5, 11, 12, 13, 14, 8, 6, 7, 15]
    for index in range(1, page + 1):
        url = ("http://1.push2.eastmoney.com/api/qt/clist/get?pn=" + str(index)
               + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281"
               + "&fltt=2&invt=2&wbp2u=|0|0|0|web&" + fid_fs[key]
               + "&fields=" + fields)
        html = requests.get(url).text
        data = re.findall(r"\"diff\":.*", html)
        if len(data) == 0:
            print("已无更多股票")
            break
        # The payload is a JSON-ish blob: split it into per-stock records,
        # trimming the surrounding braces off the first and last entries.
        datas = data[0].split("},{")
        datas[0] = datas[0].split("{")[1]
        datas[-1] = datas[-1].split("}")[0]
        for record in datas:
            stock0 = record.replace('"', "").split(",")  # ["fNN:value", ...]
            # Reorder fields and strip the "fNN:" prefix from each value.
            stock = [stock0[j].split(":")[1] for j in order]
            if len(stock0) == 17:
                stock.append(stock0[16].split(":")[1])  # listing date (f26)
            stocks.append(stock)
        print("第{}页爬取成功".format(index))
    df = pd.DataFrame(stocks)
    columns = {0: "代码", 1: "名称", 2: "最新价", 3: "涨跌额", 4: "涨跌幅", 5: "成交量",
               6: "成交额", 7: "振幅", 8: "最高", 9: "最低", 10: "今开", 11: "昨收",
               12: "量比", 13: "换手率", 14: "市盈率(动态)", 15: "市净率", 16: "上市时间"}
    df.rename(columns=columns, inplace=True)
    df.to_excel(xlsx, sheet_name=key, index=False)
    print(key + "已保存")
if __name__ == "__main__":
xlsx = pd.ExcelWriter("沪深京个股.xlsx")
for key in fid_fs:
get_stock(key)
xlsx.close()
运行结果:


心得体会
通过f12调试模式进行抓包,发现加载沪深京A股股票列表使用的url。对每种url进行删减判断,知道了fields中需要数据的f值,pn的值是页数,不同股票的主要差别在fs和fid,一些特殊股票比如新股的fid=f26是其股票数据中额外需要上市时间,而上市时间的值就保存在f26中,估计fid是强调并获取其他股票不会显示的数据。
作业③
实验内容
题目:
- 要求:爬取中国大学 2021主榜 (https://www.shanghairanking.cn/rankings/bcur/2021) 所有院校信息,并存储在数据库中,同时将浏览器 F12 调试分析的过程录制 Gif 加入至博客中。
f12调试.gif:

代码
import requests
import re
import pandas as pd
# Nuxt payload holding the 2021 main-ranking data (URL found via F12 capture).
url = "https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/202111/payload.js"
r = requests.get(url)
r.encoding = "utf-8"
html = r.text

# The payload is an IIFE: many field values appear as formal parameter names
# whose actual values are the call arguments at the end of the file.
# Build a parameter-name -> value map so those references can be resolved.
# Regexes are raw strings (the original "\(" escapes warn on Python 3.12+).
KEY = re.findall(r"function\((.*?)\)", html)
keys = KEY[0].split(",")
VALUE = re.findall(r"mutations:\[]}}\((.*?)\)", html)
# The argument list itself contains one quoted, comma-bearing string
# ("2023,2022,2021,2020"), so split around it before splitting on commas.
v = VALUE[0].split(",\"2023,2022,2021,2020\",")
values = []
values += v[0].replace("\"", "").split(",")
values += ["2023,2022,2021,2020"]
values += v[1].replace("\"", "").split(",")
Dict = dict(zip(keys, values))

univNameCn = re.findall(r"univNameCn:\"(.*?)\",", html)
province = re.findall(r"province:(.*?),", html)
univCategory = re.findall(r"univCategory:(.*?),", html)
score = re.findall(r"score:(.*?),", html)

unis = []
# One row per scraped university (replaces the hard-coded 582, so the
# script keeps working if the list length changes).
for i in range(len(univNameCn)):
    uni = [i + 1, univNameCn[i], Dict[province[i]], Dict[univCategory[i]]]
    try:
        # Scores are usually parameter references; fall back to the literal.
        uni.append(float(Dict[score[i]]))
    except KeyError:
        uni.append(float(score[i]))
    unis.append(uni)

df = pd.DataFrame(unis)
df.rename(columns={0: "排名", 1: "学校", 2: "省市", 3: "类型", 4: "总分"}, inplace=True)
df.to_excel("CnSchoolRanking.xlsx", sheet_name='2021', index=False)
print("保存成功")
运行结果:


心得体会
通过f12抓包,发现大学排名的数据都储存在payload.js中。刷新页面后的第一个payload.js存的是中国大学排名(主榜)的数据,访问其加载数据的url,正常获取数据。在获取数据的过程中发现有些数据是字母,不知道值是多少。分析payload.js后发现这就像一个函数,function后跟着是参数,在末尾是参数对应的值,re匹配这些部分并组合成字典,方便之后对这些参数值的替换。