2023数据采集与融合技术实践作业二

发布时间 2023-10-10 16:27:40作者: 人生几何5

作业①:

要求:在中国气象网(http://www.weather.com.cn)爬取给定城市集的 7 日天气预报,并保存在数据库。

输出信息:作业2 · EI-Shaddoll-Midrash/2023数据采集 - 码云 - 开源中国 (gitee.com)

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherDB:
    """SQLite-backed store for 7-day weather forecasts, keyed by (city, date)."""

    def openDB(self):
        """Open (or create) weathers.db, ensure the table exists, and clear old rows.

        Uses CREATE TABLE IF NOT EXISTS instead of the bare-except-on-create
        idiom, which silently swallowed every error (including real ones such
        as a locked database).  The table is then emptied so each run starts
        fresh — same net behavior as the original DELETE-on-existing path.
        """
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        self.cursor.execute(
            "create table if not exists weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row; duplicates (same city+date) are reported, not raised."""
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except sqlite3.Error as err:
            # Narrowed from Exception: only database errors are expected here
            # (typically a primary-key violation on re-inserted rows).
            print(err)

    def show(self):
        """Print every stored row as a fixed-width table."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    """Scrape 7-day forecasts from weather.com.cn for a fixed set of cities."""

    def __init__(self):
        # Browser-like UA so the site serves the normal HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        # City name -> weather.com.cn numeric city code used in the forecast URL.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        """Fetch, parse, and store the 7-day forecast for one city.

        Unknown cities are reported and skipped.  Network/parse failures are
        printed rather than raised so one bad city does not abort the run.
        """
        if city not in self.cityCode:  # dict membership test; .keys() was redundant
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req).read()
            # UnicodeDammit guesses the page encoding (utf-8 or gbk).
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            # Each <li> under the "t clearfix" list holds one day's forecast.
            for li in soup.select("ul[class='t clearfix'] li"):
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # A malformed day entry (e.g. today's missing high temp) is skipped.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Open the database, scrape every city in *cities*, then dump and close it."""
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        print('\n')
        print('开始输出数据库')
        self.db.show()
        self.db.closeDB()
# Scrape the four target cities and report completion.
target_cities = ["北京", "上海", "广州", "深圳"]
forecaster = WeatherForecast()
forecaster.process(target_cities)
print("completed")


心得体会:本次实验让我理解了爬虫与数据库之间的连接

作业②

要求:用 requests 和 BeautifulSoup 库方法定向爬取股票相关信息,并

存储在数据库中。

候选网站:东方财富网:https://www.eastmoney.com/
新浪股票:http://finance.sina.com.cn/stock/

技巧

在谷歌浏览器中进入 F12 调试模式进行抓包,查找股票列表加
载使用的 url,并分析 api 返回的值,并根据所要求的参数可适当更改
api 的请求参数。根据 URL 可观察请求的参数 f1、f2 可获取不同的数
值,根据情况可删减请求的参数。
参考链接:https://zhuanlan.zhihu.com/p/50099084

输出信息:

https://gitee.com/EI-Shaddoll-Midrash/2023-data-collection/tree/master/作业2

import re
import json
import requests
import pandas as pd

# Browser-like UA header; the eastmoney API rejects default client UAs.
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47"}

def getHtml(page):
    """Fetch one page of the eastmoney stock-list API.

    Returns the regex findall result: a list whose single element is the
    content of the JSON "diff" array (records joined by '},{').
    """
    url = "http://20.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112405392750108101088_1696660211532&pn="+str(page)+ "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696660211533"
    r = requests.get(url, headers=headers).text
    # Raw string: \[ and \{ are regex escapes, not Python string escapes
    # (the original non-raw pattern triggers invalid-escape warnings).
    pat0 = r'"diff":\[\{(.*?)\}\]'
    data = re.compile(pat0, re.S).findall(r)
    return data

def getOnePageStock(page):
    """Parse one API page into a list of per-stock value lists.

    Each record string looks like '"f1":2,"f2":10.5,...'; the '"fN":' keys
    are blanked out so only the comma-separated values remain.
    """
    data = getHtml(page)
    # Records are '},{'-separated inside the single captured string.
    records = data[0].split('},{')
    # Hoisted out of the loop: compile the key-stripping pattern once.
    key_pat = re.compile(r'"(\w)+":')
    # Iterate records directly instead of the range(len(...)) anti-idiom.
    return [key_pat.sub(" ", record).split(",") for record in records]
#爬虫
def main():
    """Crawl up to 30 pages of stock data and save selected columns to Excel.

    Stops early when a page's payload equals the previous page's (the API
    repeats the last page when you read past the end).
    """
    page = 1
    stocks = getOnePageStock(page)
    # Cache the previous page's raw payload: the original compared
    # getHtml(page) != getHtml(page - 1) inside the loop, re-downloading
    # both pages on every iteration (3 HTTP requests per page instead of 2).
    prev = getHtml(page)
    while page <= 30:
        page += 1
        cur = getHtml(page)
        if cur != prev:
            stocks.extend(getOnePageStock(page))
            print("已加载第" + str(page) + "页")
            prev = cur
        else:
            break
    df = pd.DataFrame(stocks)
    # Drop the unused raw fields, then reorder and rename the kept columns.
    df.drop([0, 7, 8, 9, 10, 12, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], axis=1, inplace=True)
    df.index = range(1, len(df) + 1)
    df = df[[11, 13, 1, 2, 3, 4, 5, 6, 14, 15, 16, 17]]
    columns = {11: "股票代码", 13: "股票名称", 1: "最新报价", 2: "涨跌幅", 3: "涨跌额", 4: "成交量", 5: "成交额",
               6: "振幅", 14: "最高", 15: "最低", 16: "今开", 17: "昨收"}
    df = df.rename(columns=columns)
    df.to_excel("./股票.xlsx")
    print("已保存文件")
main()


心得体会:

理解了利用js包对内容进行爬取

作业③:

要求:爬取中国大学 2021 主榜

(https://www.shanghairanking.cn/rankings/bcur/2021)所有院校信
息,并存储在数据库中,同时将浏览器 F12 调试分析的过程录制 Gif 加
入至博客中。

技巧:分析该网站的发包情况,分析获取数据的 api

输出信息:

https://gitee.com/EI-Shaddoll-Midrash/2023-data-collection/tree/master/作业2

import requests
import pandas as pd
import re

# Fetch the pre-rendered Nuxt payload that backs the 2021 ranking page.
payload_url = "https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js"
response = requests.get(url=payload_url)
text = response.text

# Pull the per-school fields out of the payload with regexes.
names = re.findall(',univNameCn:"(.*?)",', text)       # school names (literal strings)
scores = re.findall(',score:(.*?),', text)             # total scores
categories = re.findall(',univCategory:(.*?),', text)  # school types, as short code tokens
provinces = re.findall(',province:(.*?),', text)       # provinces, as short code tokens

# The payload is one big JS function call: the function's parameter list
# holds the short code tokens, and the trailing call arguments hold the
# literal values those tokens stand for.  Extract both lists so each token
# can be mapped back to its value.
func_header = re.findall('function(.*?){', text)[0]
params = func_header[func_header.find('a'):func_header.find('pE')].split(',')

mutation_src = re.findall('mutations:(.*?);', text)[0]
args = mutation_src[mutation_src.find('(') + 1:mutation_src.find(')')].split(",")

# Assemble the ranking table: token -> argument lookup, quotes stripped.
df = pd.DataFrame(columns=["排名", "学校", "省份", "类型", "总分"])
for idx, school in enumerate(names):
    province_value = args[params.index(provinces[idx])][1:-1]
    category_value = args[params.index(categories[idx])][1:-1]
    df.loc[idx] = [idx + 1, school, province_value, category_value, scores[idx]]
print(df)
df.to_excel("./school.xlsx")


心得体会

通过本次实验让我学会了如何使用pandas包进行数据保存,让我受益匪浅。