Assignment ①:
Requirement: scrape the 7-day weather forecasts for a given set of cities from the China Weather Network (http://www.weather.com.cn) and save them to a database.
Output: https://gitee.com/EI-Shaddoll-Midrash/2023-data-collection/tree/master/作业2
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    # open/create the SQLite database and the weathers table
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except Exception:
            # table already exists: clear the old rows instead
            self.cursor.execute("delete from weathers")

    # commit and close the database
    def closeDB(self):
        self.con.commit()
        self.con.close()

    # insert one forecast record
    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    # print every row stored in the database
    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    # prepare the request headers and the city-to-code mapping
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    # fetch and parse the 7-day forecast page for one city
    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return
        # scrape the forecast page
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    # open the database, crawl each city, then dump the stored rows
    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        print('\n')
        print('Dumping database contents')
        self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")


Takeaway: this assignment helped me understand how a web scraper connects to and feeds a database.
Assignment ②
Requirement: use the requests and BeautifulSoup libraries to crawl stock
information from a chosen site and store it in a database.
Candidate sites: Eastmoney: https://www.eastmoney.com/
Sina Finance (stocks): http://finance.sina.com.cn/stock/
Technique:
Open the F12 developer tools in Chrome and capture the network traffic to find
the URL used to load the stock list, then analyse the values the API returns.
The request parameters f1, f2, ... select which values come back, so they can be
adjusted or trimmed to match the required columns (a small sketch of this follows
the reference link below).
Reference: https://zhuanlan.zhihu.com/p/50099084
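A minimal sketch of the trimming idea, requesting only a few fields and printing the raw response. The field meanings (f12 = stock code, f14 = name, f2 = latest price, f3 = change in percent) are assumptions inferred from the column mapping used in the full script below; how the server actually honours a shortened fields list should be verified in the F12 network panel.

import requests

# hedged sketch: same endpoint as the captured URL, but with most query
# parameters dropped and only four data fields requested
trimmed_url = (
    "http://20.push2.eastmoney.com/api/qt/clist/get"
    "?pn=1&pz=20&po=1&np=1&fltt=2&invt=2&fid=f3"
    "&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048"
    "&fields=f2,f3,f12,f14"  # assumed: latest price, change %, code, name
)
resp = requests.get(trimmed_url, headers={"User-Agent": "Mozilla/5.0"})
print(resp.text[:200])  # inspect the start of the returned JSON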
Output:
https://gitee.com/EI-Shaddoll-Midrash/2023-data-collection/tree/master/作业2
import re
import requests
import pandas as pd

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47"}

# request one page of the stock list API and extract the "diff" payload
def getHtml(page):
    url = "http://20.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112405392750108101088_1696660211532&pn=" + str(page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696660211533"
    r = requests.get(url, headers=headers).text
    pat0 = r'"diff":\[\{(.*?)\}\]'
    data = re.compile(pat0, re.S).findall(r)
    return data

# turn one page of raw text into a list of per-stock value lists
def getOnePageStock(page):
    data = getHtml(page)
    datas = data[0].split('},{')
    stocks = []
    for i in range(len(datas)):
        str1 = r'"(\w)+":'
        stock = re.sub(str1, " ", datas[i])  # strip the field names, keep the values
        stock = stock.split(",")
        stocks.append(stock)
    return stocks

# crawler entry point
def main():
    page = 1
    stocks = getOnePageStock(page)
    # crawl up to a fixed number of pages, stopping early once a page repeats
    while page <= 30:
        page += 1
        if getHtml(page) != getHtml(page - 1):
            stocks.extend(getOnePageStock(page))
            print("Loaded page " + str(page))
        else:
            break
    df = pd.DataFrame(stocks)
    # keep only the useful columns and write them to an Excel file
    df.drop([0, 7, 8, 9, 10, 12, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], axis=1, inplace=True)
    # print(df)
    df.index = range(1, len(df) + 1)
    df = df[[11, 13, 1, 2, 3, 4, 5, 6, 14, 15, 16, 17]]
    columns = {11: "股票代码", 13: "股票名称", 1: "最新报价", 2: "涨跌幅", 3: "涨跌额", 4: "成交量", 5: "成交额",
               6: "振幅", 14: "最高", 15: "最低", 16: "今开", 17: "昨收"}
    df = df.rename(columns=columns)
    df.to_excel("./股票.xlsx")
    print("File saved")

main()
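The requirement also asks for database storage, while main() above only writes an Excel file. A minimal sketch of one way to close that gap, assuming the same df built in main(); the database file stocks.db and table name stocks are made-up names for illustration:

import sqlite3

def save_to_db(df):
    # write the cleaned DataFrame into a SQLite table; pandas creates the
    # table from the column names and replaces any previous contents
    con = sqlite3.connect("stocks.db")
    df.to_sql("stocks", con, if_exists="replace", index=False)
    con.commit()
    con.close()

Calling save_to_db(df) just before df.to_excel would keep the Excel output and add the database copy.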


Takeaway:
I learned how to crawl content by working directly with the JSON data packets that the page loads via JavaScript.
Assignment ③:
Requirement: crawl the information of all universities on the main list of the 2021 Chinese university ranking (https://www.shanghairanking.cn/rankings/bcur/2021) and store it in a database; also record the F12 debugging and analysis process in the browser as a GIF and add it to the blog post.
Technique: analyse how the site loads its data and find the API that returns it.
Output:
https://gitee.com/EI-Shaddoll-Midrash/2023-data-collection/tree/master/作业2
import requests
import pandas as pd
import re

# the ranking page loads its data from a pre-rendered payload.js file
url = "https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js"
response = requests.get(url=url)
name_grep = ',univNameCn:"(.*?)",'
name = re.findall(name_grep, response.text)          # university names
score_grep = ',score:(.*?),'
score = re.findall(score_grep, response.text)        # total scores
category_grep = ',univCategory:(.*?),'
category = re.findall(category_grep, response.text)  # university categories
province_grep = ',province:(.*?),'
province = re.findall(province_grep, response.text)  # provinces
code_name_grep = 'function(.*?){'
code_name = re.findall(code_name_grep, response.text)
start_code = code_name[0].find('a')
end_code = code_name[0].find('pE')
# take the parameter names of the wrapping function and keep them in code_name
code_name = code_name[0][start_code:end_code].split(',')
value_name_grep = 'mutations:(.*?);'
value_name = re.findall(value_name_grep, response.text)
start_value = value_name[0].find('(')
end_value = value_name[0].find(')')
# take the argument values those parameters stand for and keep them in value_name
value_name = value_name[0][start_value + 1:end_value].split(",")
df = pd.DataFrame(columns=["排名", "学校", "省份", "类型", "总分"])
for i in range(len(name)):
    # map each short code back to its real value, stripping the surrounding quotes
    province_name = value_name[code_name.index(province[i])][1:-1]
    category_name = value_name[code_name.index(category[i])][1:-1]
    df.loc[i] = [i + 1, name[i], province_name, category_name, score[i]]
print(df)
df.to_excel("./school.xlsx")
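As in assignment ②, the requirement mentions a database while the script only writes an Excel file. A minimal sketch of storing the same rows with the plain sqlite3 approach used in assignment ①; the file schools.db and table name schools are hypothetical:

import sqlite3

con = sqlite3.connect("schools.db")
cur = con.cursor()
# one row per university: rank, name, province, category, total score
cur.execute("create table if not exists schools "
            "(rank int, name varchar(32), province varchar(16), category varchar(16), score varchar(16))")
cur.executemany("insert into schools values (?,?,?,?,?)",
                df.itertuples(index=False, name=None))
con.commit()
con.close()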


Takeaway
This assignment taught me how to save data with the pandas package, which I found very rewarding.