2023 Data Collection and Fusion: Assignment 2

Published 2023-10-09 22:51:58 · Author: nunusu

Assignment ①:

Experiment Contents

Task:

Code
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    def openDB(self):
        self.con=sqlite3.connect("weathers.db")
        self.cursor=self.con.cursor()
        try:
            # Create the table on first run; the composite primary key
            # (wCity, wDate) blocks duplicate rows for the same city and day.
            self.cursor.execute(
                "create table weathers (wCity varchar(16),wDate varchar(16),"
                "wWeather varchar(64),wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))")
        except sqlite3.OperationalError:
            # The table already exists: clear out the old rows instead.
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.50"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601", "海淀": "101010200"}

    def forecastCity(self, city):
        if city not in self.cityCode:
            print(city + " code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    try:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    except IndexError:
                        temp = li.select('p[class="tem"] i')[0].text  # sometimes "today" shows only one temperature, with no high/low pair
                    # print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳", "海淀"])
print("completed")

Run results:

Reflections

Through this task I learned the basic operations for saving scraped data in a SQLite database: creating a table, inserting rows with parameterized SQL, and committing before closing the connection.
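As a small extension, sqlite3 connections can also serve as context managers, which commit automatically on success and roll back on error. A minimal sketch reusing the table from the code above (the sample row values are made up for illustration):

import sqlite3

# Minimal sketch: the connection doubles as a context manager that
# commits on success and rolls back if an exception escapes the block.
with sqlite3.connect("weathers.db") as con:
    con.execute(
        "create table if not exists weathers (wCity varchar(16),wDate varchar(16),"
        "wWeather varchar(64),wTemp varchar(32),"
        "constraint pk_weather primary key (wCity,wDate))")
    # "insert or replace" overwrites a row that hits the (wCity, wDate) key,
    # so re-running the scraper updates rows instead of raising an error.
    con.execute("insert or replace into weathers values (?,?,?,?)",
                ("北京", "9日(今天)", "晴", "25℃/15℃"))  # illustrative values
con.close()  # the with-block only commits; the connection still needs closing

Combining "insert or replace" with the primary key makes re-runs idempotent, which would also remove the need for the delete-all step in openDB.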

Assignment ②:

Experiment Contents

Task

Code
import requests
import re
import pandas as pd
fid_fs = {
    "沪深京A股": "fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
    "上证A股": "fid=f3&fs=m:1+t:2,m:1+t:23",
    "深证A股": "fid=f3&fs=m:0+t:6,m:0+t:80",
    "北证A股": "fid=f3&fs=m:0+t:81+s:2048",
    "新股": "fid=f26&fs=m:0+f:8,m:1+f:8",
    "创业版": "fid=f3&fs=m:0+t:80",
    "科创版": "fid=f3&fs=m:1+t:23",
    "沪股通": "fid=f26&fs=b:BK0707",
    "深股通": "fid=f26&fs=b:BK0804",
    "B股": "fid=f3&fs=m:0+t:7,m:1+t:3",
    "风险警示版": "fid=f3&fs=m:0+f:4,m:1+f:4",
    "两网及退市": "fid=f3&fs=m:0+s:3"
}


def get_stock(key):
    # f26 (listing date) is appended only for categories whose fid is f26.
    fields = "f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f14,f15,f16,f17,f18,f23" + (",f26" if fid_fs[key][5:7] == "26" else "")
    stocks = []
    while True:
        try:
            page = int(input("Enter the number of pages to crawl for {}: ".format(key)))
            break
        except ValueError:
            print("wrong input!")
    # page = 3
    for index in range(1, page+1):
        url = "http://1.push2.eastmoney.com/api/qt/clist/get?pn=" + str(index) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&"+fid_fs[key]+"&fields="+fields
        html = requests.get(url).text
        # Grab everything from the "diff" key onward; it holds the stock records.
        data = re.findall('"diff":.*', html)
        if len(data) == 0:
            print("已无更多股票")
            break
        # Split the JSON array into one string per stock and trim the braces.
        datas = data[0].split("},{")
        datas[0] = datas[0].split("{")[1]
        datas[-1] = datas[-1].split("}")[0]

        for i in range(len(datas)):
            stock0 = datas[i].replace('"', "").split(",")  # raw record fields
            list_ = [9, 10, 0, 1, 2, 3, 4, 5, 11, 12, 13, 14, 8, 6, 7, 15]
            stock = []
            for j in list_:
                stock.append(stock0[j].split(":")[1])  # reorder columns and drop the "fN:" prefix
            if len(stock0) == 17:
                stock.append(stock0[16].split(":")[1])
            stocks.append(stock)
        print("第{}页爬取成功".format(index))

    # print(stocks)
    df = pd.DataFrame(stocks)
    columns = {0: "代码", 1: "名称", 2: "最新价", 3: "涨跌额", 4: "涨跌幅", 5: "成交量", 6: "成交额", 7: "振幅", 8: "最高",
               9: "最低", 10: "今开", 11: "昨收", 12: "量比", 13: "换手率", 14: "市盈率(动态)", 15: "市净率", 16: "上市时间"}
    df.rename(columns=columns, inplace=True)
    df.to_excel(xlsx, sheet_name=key, index=False)
    print(key + "已保存")


if __name__ == "__main__":

    xlsx = pd.ExcelWriter("沪深京个股.xlsx")
    for key in fid_fs:
        get_stock(key)
    xlsx.close()

Run results:

Reflections

By capturing requests in the browser's F12 developer tools, I found the URL used to load the 沪深京A股 stock list. Trimming each URL parameter by parameter and re-testing showed that the f-values listed in fields select which data columns come back, and that pn is the page number. The main difference between stock categories lies in fs and fid: special categories such as 新股 (new stocks) use fid=f26 because their records additionally need the listing date, which is stored in f26. My guess is that fid highlights, and thus fetches, data that other categories would not display.
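To make those parameter roles concrete, here is a minimal sketch that fetches one page of 沪深京A股 from the same endpoint, letting requests encode the query string and parsing the body as JSON instead of regex-splitting it (pz is assumed to be the page size; the field meanings follow the column mapping in the code above):

import requests

# Minimal sketch: pn = page number, pz = page size (assumed),
# fs/fid select the stock category, fields picks the f-values to return.
params = {
    "pn": 1, "pz": 20, "po": 1, "np": 1, "fltt": 2, "invt": 2,
    "ut": "bd1d9ddb04089700cf9c27f6f7426281",
    "fid": "f3",
    "fs": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
    "fields": "f12,f14,f2,f3",  # 代码, 名称, 最新价, 涨跌额 per the mapping above
}
resp = requests.get("http://1.push2.eastmoney.com/api/qt/clist/get", params=params)
data = resp.json()["data"]  # becomes null once past the last page
for item in data["diff"]:
    print(item["f12"], item["f14"], item["f2"], item["f3"])

Reading the JSON directly removes the manual "},{" splitting, at the cost of assuming the endpoint keeps returning plain JSON, as it does for the URL above.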

Assignment ③

Experiment Contents

Task:

F12 debugging capture (f12调试.gif):

Code
import requests
import re
import pandas as pd

url = "https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/202111/payload.js"
r = requests.get(url)
r.encoding = "utf-8"
html = r.text
# print(html)

Dict = {}  # some values appear only as function parameter names; map each name to its real value
KEY = re.findall(r"function\((.*?)\)", html)
keys = KEY[0].split(",")
# print(keys)
VALUE = re.findall(r"mutations:\[]}}\((.*?)\)", html)
v = VALUE[0].split(",\"2023,2022,2021,2020\",")  # special case: this value itself contains commas, so split around it first
values = []
values = values + v[0].replace("\"", "").split(",")
values = values + ["2023,2022,2021,2020"]
values = values + v[1].replace("\"", "").split(",")
# print(values)
for i in range(len(keys)):
    Dict[keys[i]] = values[i]
# print(Dict)

univNameCn = re.findall(r'univNameCn:"(.*?)",', html)
province = re.findall(r'province:(.*?),', html)
univCategory = re.findall(r'univCategory:(.*?),', html)
score = re.findall(r'score:(.*?),', html)

unis = []
for i in range(len(univNameCn)):  # one row per university (582 in the 2021 ranking)
    uni = []
    uni.append(i+1)
    uni.append(univNameCn[i])
    uni.append(Dict[province[i]])
    uni.append(Dict[univCategory[i]])
    try:
        uni.append(float(Dict[score[i]]))  # score encoded as a parameter name
    except KeyError:
        uni.append(float(score[i]))  # score given as a literal number
    unis.append(uni)

df = pd.DataFrame(unis)
columns = {0: "排名", 1: "学校", 2: "省市", 3: "类型", 4: "总分"}
df.rename(columns=columns, inplace=True)
df.to_excel("CnSchoolRanking.xlsx", sheet_name='2021', index=False)
print("保存成功")

Run results:

Reflections

Capturing requests with the F12 tools showed that the university-ranking data is all stored in payload.js: the first payload.js loaded after refreshing the page holds the Chinese university ranking (main list), and requesting its URL returns the data directly. While extracting fields I noticed that some values were bare letters whose meaning was unclear. Analyzing payload.js revealed that it is structured like a function call: the parameter names follow function, and the corresponding argument values sit at the end. Matching both parts with re and zipping them into a dictionary makes it easy to substitute in the real values afterwards, as the sketch below illustrates.
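The trick is easier to see on a toy string shaped like payload.js (this string is invented for illustration; the real file additionally needs the comma special-casing shown in the code above):

import re

# Toy payload: parameter names come after "function(",
# the matching argument values sit at the very end.
js = 'window.__NUXT__=(function(a,b,c){return {univs:[' \
     '{univNameCn:"某大学",province:a,score:b},' \
     '{univNameCn:"另一大学",province:c,score:b}]}}("北京",95.3,"上海"));'

keys = re.findall(r"function\((.*?)\)", js)[0].split(",")
values = re.findall(r"}}\((.*?)\)", js)[0].split(",")
mapping = dict(zip(keys, values))
print(mapping)  # {'a': '"北京"', 'b': '95.3', 'c': '"上海"'}

With this mapping in hand, a letter such as province:a can be replaced by its real value, which is exactly what the Dict lookup in the code above does.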