数据采集与融合技术实践作业二-JZTXT

作业一:

要求：

在中国气象网（http://www.weather.com.cn）爬取给定城市集合的7日天气预报，并保存在数据库中。

输出信息：

Gitee文件夹链接

实验：

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
#数据库
class WeatherDB:
    """SQLite-backed store for per-city daily weather rows.

    Rows are kept in the ``weathers`` table of ``weathers.db`` (opened
    relative to the current working directory) and are keyed by
    ``(wCity, wDate)``.
    """

    def openDB(self):
        """Open ``weathers.db`` and guarantee an empty ``weathers`` table.

        A table left over from an earlier run is kept but emptied, so every
        run starts without stale rows.
        """
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        # "if not exists" replaces a bare ``except:`` that used to interpret
        # *any* CREATE TABLE failure as "the table is already there".
        self.cursor.execute(
            "create table if not exists weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending changes and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row.

        Database errors (for example a duplicate ``(city, date)`` key) are
        printed and otherwise ignored so that one bad row does not abort
        the whole crawl.
        """
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except sqlite3.Error as err:
            print(err)

    def show(self):
        """Print every stored row as a numbered, tab-separated table."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("{:4}\t{:10}\t{:14}\t{:24}\t{:16}".format("序号","城市", "日期", "天气信息", "温度"))
        for i, row in enumerate(rows, start=1):
            print("{:4}\t{:10}\t{:10}\t{:24}\t{:16}".format(i,row[0], row[1], row[2], row[3]))
class WeatherForecast:
    """Crawl forecast pages from www.weather.com.cn and store them via WeatherDB."""

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        # City name -> code that is spliced into the forecast page URL.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        """Download the forecast page of ``city`` and insert one row per day.

        Unknown cities are reported and skipped. Download, parse and per-row
        failures are printed rather than raised, so one failing city or list
        item does not stop the remaining work. ``self.db`` must have been
        set up beforehand (``process`` does this).
        """
        if city not in self.cityCode:
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # Close the response deterministically and never hang forever on
            # an unresponsive server.
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = resp.read()
            # Let UnicodeDammit pick between the two candidate encodings.
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            for li in soup.select("ul[class='t clearfix'] li"):
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    # Stored as "<span text>/<i text>" taken from the "tem" paragraph.
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # Best effort: an item missing one of the expected tags is
                    # reported and skipped.
                    print(err)
        except Exception as err:
            # Best effort at the city level as well (network or parser failure).
            print(err)

    def process(self, cities):
        """Crawl every city in ``cities``, print the stored rows, close the DB."""
        self.db = WeatherDB()
        self.db.openDB()
        try:
            for city in cities:
                self.forecastCity(city)
            self.db.show()
        finally:
            # Always commit and release the connection, even if printing fails.
            self.db.closeDB()
# Crawl the forecast for each listed city, then report that the run finished.
ws = WeatherForecast()
ws.process([
    "北京",
    "上海",
    "广州",
    "深圳",
])
print("completed")

结果：

心得体会：

本次实验我使用了urllib和bs4库完成对信息的爬取，难点在于多页爬取。主要的解决方法有两种：一是查找数据接口，找到不同数据页面所需要的请求格式；二是使用selenium解析html源码，定位翻页按钮标签。

作业二:

要求：

用requests和自选提取信息方法定向爬取股票相关信息，并存储在数据库中。

输出信息：

Gitee文件夹链接

实验：

import requests
import re
import math
#用get方法访问服务器并提取页面数据
def getHtml(fs,page,k):
    """Fetch one page of the eastmoney.com stock list.

    Args:
        fs: value of the ``fs`` query parameter (selects the board).
        page: 1-based page number, sent as ``pn`` (page size is fixed at 20).
        k: value of the trailing ``_`` query parameter, as a string.

    Returns:
        ``(data, total)`` where ``data`` is a one-element list holding the
        raw record text that follows the ``"diff":[`` marker, and ``total``
        is the overall record count reported by the server.

    Raises:
        ValueError: if the response carries no ``"data"`` payload or no
            ``"total"`` count.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3141.8 Safari/537.36"}
    # Paging is done through "pn"; boards differ only in "fs" and "_".
    url = ("http://35.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112402976183355999211_1601533720950&pn="
           + str(page)
           + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs="
           + fs
           + "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_="
           + k)
    r = requests.get(url, headers=headers, timeout=30)
    data = re.compile(r'"data":.*\]', re.S).findall(r.text)
    if not data:
        raise ValueError("no \"data\" payload in response for page " + str(page))
    # Total record count; callers divide it by the page size to get the page count.
    total_match = re.search(r'"total":(\d+)', data[0])
    if total_match is None:
        raise ValueError("no \"total\" count in response for page " + str(page))
    total = int(total_match.group(1))  # int() instead of eval() on server-supplied text
    # Drop the leading total/diff header. Searching for the marker works for
    # any number of digits in the total; the former fixed offset of 29 is
    # kept only as a fallback when the marker is absent.
    marker = '"diff":['
    start = data[0].find(marker)
    if start != -1:
        data[0] = data[0][start + len(marker):]
    else:
        data[0] = data[0][29:]
    return data,total
#获取单个页面股票数据
def getOnePageStock(fs,page,k,number,i):
    """Look for the stock with code ``number`` on one result page.

    Args:
        fs, page, k: passed through unchanged to ``getHtml``.
        number: stock code to look for (compared as a string).
        i: board label, printed before the match.

    Returns:
        1 if the stock was found on this page (its details are printed),
        otherwise 0.
    """
    page_text = getHtml(fs,page,k)[0][0]
    for record in page_text.split('},{'):  # one chunk per stock
        # Strip the quotes, cut into "key:value" fields and keep the values.
        values = []
        for field in record.replace('"',"").split(','):
            parts = field.split(':')
            # A field without a colon would otherwise raise IndexError.
            values.append(parts[1] if len(parts) > 1 else "")
        # Index 16 is the highest position read below; skip short records.
        if len(values) <= 16 or values[11] != number:
            continue
        print(i)
        print("在第"+str(page)+"页:")
        print("{:4}\t{:4}\t{:4}\t{:4}\t{:4}\t{:4}\t{:4}\t{:4}".format("代码", "名称", "今开", "最高", "最低","涨跌幅","换手率","成交量"))
        print("{:4}\t{:4}\t{:4}\t{:4}\t{:4}\t{:4}\t{:4}\t{:4}".format(values[11],values[13],values[16],values[14],values[15],values[2],values[7],values[4]))
        return 1  # found: no need to scan the rest of the page
    return 0
def main():
    """Search every board, page by page, for one fixed stock code."""
    # Board label -> (``fs`` filter value, ``_`` query value), in search order.
    boards = {
        "沪深A股": ("m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23", "1601536578738"),
        "上证A股": ("m:1+t:2,m:1+t:23", "1601536578736"),
        "深证A股": ("m:0+t:6,m:0+t:13,m:0+t:80", "1601536578759"),
        "新股": ("m:0+f:8,m:1+f:8", "1601536578765"),
        "中小板": ("m:0+t:13", "1601536578882"),
        "创业板": ("m:0+t:80", "1601536578888"),
    }
    # Three self-chosen digits followed by the last three digits of the student ID.
    target_code = '603119'
    for board, (board_fs, board_k) in boards.items():
        found = getOnePageStock(board_fs, 1, board_k, target_code, board)
        # Page count = total records / 20 per page, rounded up.
        record_total = getHtml(board_fs, 1, board_k)[1]
        last_page = math.ceil(record_total/20.0)
        # Walk the remaining pages until the stock shows up or pages run out.
        next_page = 2
        while next_page <= last_page and found != 1:
            found = getOnePageStock(board_fs, next_page, board_k, target_code, board)
            next_page += 1
# Start the stock search; runs unconditionally whenever this code is loaded.
main()

结果：

心得体会：

本次任务使用requests和自选的信息提取方法定向爬取股票相关信息，并存储在数据库中。通过这个任务，我掌握了requests库和GET方法。一开始我觉得任务十分困难，后来在同学的帮助下，我成功爬取了网页。

作业三：

要求：

爬取中国大学2021主榜（https://www.shanghairanking.cn/rankings/bcur/2021）所有院校信息，并存储在数据库中，同时将浏览器F12调试分析的过程录制Gif加入至博客中。

技巧：

分析该网站的发包情况，分析获取数据的api。

输出信息：

Gitee文件夹链接

实验：

import requests  # 方式1获取URL信息
import urllib.request  # 方式2获取URL信息
from bs4 import BeautifulSoup
import bs4
# 从网络上获取大学排名网页内容。
def getHTMLText(url):
    """Return the document at ``url`` as text, or ``None`` if it cannot be fetched.

    Two strategies are tried in order: ``requests`` first, then
    ``urllib.request`` as a fallback. Each failure is printed, not raised.
    """
    # ---- Method 1: requests ----
    try:
        res = requests.get(url, timeout=30)
        res.raise_for_status()  # turn HTTP error statuses into exceptions
        res.encoding = res.apparent_encoding  # decode with the detected encoding
        return res.text
    except Exception as err:
        # Deliberately broad: any failure here just triggers the fallback.
        print(err)
    # ---- Method 2: urllib ----
    try:
        req = urllib.request.Request(url)
        with urllib.request.urlopen(req, timeout=30) as resp:
            # read() must be *called*; ``.read.decode()`` raised AttributeError
            # on every attempt, so this fallback could never succeed.
            return resp.read().decode()
    except Exception as err:
        print(err)
    return None
# 提取网页内容中信息到合适的数据结构.
def fillUnivList(ulist, html):
    """Append one ``[rank, name, col2, col3, col4]`` row per table row to ``ulist``.

    The rows are the ``<tr>`` children of the first ``<tbody>`` in ``html``.
    Rank comes from the first ``<td>``, the name from the first ``<a>`` of
    the row, and the last three values from the third to fifth ``<td>``.
    ``ulist`` is modified in place; nothing is returned.

    Empty or missing ``html``, a document without ``<tbody>``, and rows that
    have no ``<a>`` or fewer than five cells are skipped instead of raising.
    """
    if not html:
        # Nothing to parse, e.g. because the download failed.
        return
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        return
    for tr in tbody.children:
        # Children also include bare text nodes (whitespace); keep real tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        links = tr('a')   # calling a tag is shorthand for find_all()
        cells = tr('td')
        if not links or len(cells) < 5:
            continue
        # get_text(strip=True) also copes with an <a> that has nested markup,
        # where ``.string`` would be None.
        ulist.append([cells[0].text.strip(), links[0].get_text(strip=True), cells[2].text.strip(),
                      cells[3].text.strip(), cells[4].text.strip()])
# 利用数据结构展示并输出结果:定义函数
def printUnivList(ulist1, num):
    """Print a header line followed by at most ``num`` rows of ``ulist1``.

    Each row must provide at least five fields. When ``ulist1`` holds fewer
    than ``num`` rows, all of them are printed; a non-positive ``num``
    prints only the header.
    """
    tplt = "{0:^10}\t{1:^10}\t{2:^12}\t{3:^12}\t{4:^10}"
    print(tplt.format("排名", "学校名称", "省份", "学校类型", "总分"))
    # Slicing clamps to the list length, so a short list no longer raises
    # IndexError; max() keeps a negative num from slicing off the tail.
    for u in ulist1[:max(num, 0)]:
        print(tplt.format(u[0], u[1], u[2], u[3], u[4]))
def main():
    """Download the 2021 ranking page, extract its rows and print the first 25."""
    ranking_url = "https://www.shanghairanking.cn/rankings/bcur/2021"
    uinfo = []  # filled in place by fillUnivList
    fillUnivList(uinfo, getHTMLText(ranking_url))
    # 25 rows: one screenful of data.
    printUnivList(uinfo, 25)


if __name__ == '__main__':
    main()

结果：

心得体会：

本次实验我使用了urllib和bs4库完成对大学排名信息的爬取，难点在于多页爬取。主要的解决方法有两种：一是查找数据接口，找到不同数据页面所需要的请求格式；二是使用selenium解析html源码，定位翻页按钮标签。