Data Collection and Fusion Technology: Assignment 3

Posted 2023-11-01 23:19:26 by 兔叽汁汁

Assignment ①

Crawling the China Weather Network

Crawl images from the China Weather Network (single-threaded and multi-threaded).

  1. Single-threaded
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request

def MySpider(url):
    global count
    try:
        urls = []
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        # Guess the encoding; the site serves both utf-8 and gbk pages
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                # Resolve a relative src against the page URL; keep the page
                # URL itself intact so later joins use the right base
                img_url = urllib.request.urljoin(url, src)
                if img_url not in urls and count < 109:
                    urls.append(img_url)
                    print(img_url)
                    download(img_url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)


def download(url):
    global count
    try:
        count = count + 1
        # Keep the extension when the URL ends with one like ".jpg"
        if url[len(url) - 4] == ".":
            ext = url[len(url) - 4:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("C:\\Users\\zwl\\Desktop\\images\\images1\\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)

count = 0
url = "http://www.weather.com.cn/"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31"}
name = ['forecast', 'alarm', 'satellite', 'space', 'trip', 'life', 'air', 'climate', 'science']
for i in range(0, 9):  # nine channel pages under the home page
    if count < 109:
        url1 = url + name[i] + '/'
        MySpider(url1)
print("Crawled " + str(count) + " images in total")

Run results:


2. Multi-threaded

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading


def MySpider1(start_url):
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                url = urllib.request.urljoin(start_url, src)
                if url not in urls and count < 109:
                    urls.append(url)  # remember seen URLs so duplicates are skipped
                    print(url)
                    with count_lock:  # the page threads share this counter
                        count = count + 1
                        my_count = count
                    T = threading.Thread(target=download, args=(url, my_count))
                    T.daemon = False  # non-daemon so downloads finish before exit
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)


def download(url, count):
    try:
        # Keep the extension when the URL ends with one like ".jpg"
        if url[len(url) - 4] == ".":
            ext = url[len(url) - 4:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("C:\\Users\\zwl\\Desktop\\images\\images2\\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)

count = 0
count_lock = threading.Lock()  # guards the shared image counter across threads
url = "http://www.weather.com.cn/"
headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31"}
threads = []

name = ['forecast', 'alarm', 'satellite', 'space', 'trip', 'life', 'air', 'climate', 'science']
for i in range(0, 9):  # nine channel pages, one spider thread each
    if count < 109:
        url1 = url + name[i] + '/'
        t = threading.Thread(target=MySpider1, args=(url1,))
        threads.append(t)
        t.start()

for t in threads:
    t.join()
print("Crawled " + str(count) + " images in total")

Run results:

Reflections

This crawler fetches page images in both a single-threaded and a multi-threaded version. In the multi-threaded MySpider1, the crawler requests each channel page, parses the response with BeautifulSoup, selects all img tags, and extracts every image URL. Each image is then downloaded on its own thread, so multiple downloads run concurrently, and each finished image is saved to the target directory. A pool-based variant is sketched below.
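
For comparison, the same fan-out can be written with a thread pool, which joins its workers automatically and avoids the hand-managed counter lock. This is a minimal sketch, not the assignment's code: the URL list and the index-based filenames are placeholders.

from concurrent.futures import ThreadPoolExecutor
import urllib.request

def fetch(job):
    # job is an (index, url) pair; save under an index-based name, as above
    idx, img_url = job
    data = urllib.request.urlopen(img_url, timeout=100).read()
    with open(str(idx) + ".jpg", "wb") as fobj:
        fobj.write(data)
    return img_url

img_urls = ["http://www.weather.com.cn/example.jpg"]  # placeholder list
with ThreadPoolExecutor(max_workers=8) as pool:  # the with-block waits for all workers
    for done in pool.map(fetch, enumerate(img_urls, start=1)):
        print("downloaded", done)

Because pool.map preserves input order and the pool handles joining, no global counter or explicit join loop is needed.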

Assignment ②

Crawling stock information

Crawl stock data from Eastmoney (东方财富网).

  1. items.py
import scrapy

# 3-2
class StocksItem(scrapy.Item):
    name = scrapy.Field()   # stock name (f14)
    price = scrapy.Field()  # latest price (f2)
    t1 = scrapy.Field()     # change percent (f3)
    t2 = scrapy.Field()     # change amount (f4)
    t3 = scrapy.Field()     # volume (f5)
    t4 = scrapy.Field()     # turnover (f6)
    t5 = scrapy.Field()     # amplitude (f7)
    t6 = scrapy.Field()     # day high (f15)
    t7 = scrapy.Field()     # day low (f16)
    t8 = scrapy.Field()     # opening price (f17)
    t9 = scrapy.Field()     # previous close (f18)
    t10 = scrapy.Field()    # volume ratio (f10)
    t11 = scrapy.Field()    # stock code (f12)
  2. 3-2.py
import json
import scrapy
from demo1.items import StocksItem

class Stocks(scrapy.Spider):
    name = 'Stocks'
    # The list API is served from 36.push2.eastmoney.com, so allow the parent
    # domain; with only quote.eastmoney.com the offsite middleware would drop
    # any follow-up page requests
    allowed_domains = ['eastmoney.com']

    total_page_num = 0
    base_url = r'http://36.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124017913335698798893_1696658430311&pn={}' \
               r'&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web' \
               r'&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,' \
               r'f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696658430312'
    start_urls = [base_url.format(1)]

    def parse(self, response):
        # The API answers with JSONP: jQuery...({...});
        # cut off the callback name and the trailing ");" before json.loads
        data = response.text
        data = data.split('(', maxsplit=1)[1]
        data = data[:-2]
        datas = json.loads(data)
        data_list = datas['data']['diff']

        for data in data_list:
            dic = StocksItem()
            dic['name'] = str(data['f14'])
            dic['price'] = str(data['f2'])
            dic['t1'] = str(data['f3']) + "%"
            dic['t2'] = str(data['f4'])
            dic['t3'] = str(data['f5'] / 1000.0)
            dic['t4'] = str(data['f6'] / 100000000.0)
            dic['t5'] = str(data['f7']) + "%"
            dic['t6'] = str(data['f15'])
            dic['t7'] = str(data['f16'])
            dic['t8'] = str(data['f17'])
            dic['t9'] = str(data['f18'])
            dic['t10'] = str(data['f10'])
            dic['t11'] = str(data['f12'])
            print(dic)
            yield dic
  3. pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import pymysql
# assignment 3-2

class StockPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="MyDB",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # Create the table before clearing it; otherwise the very first
            # run fails on the delete and the pipeline never opens
            self.cursor.execute("create table if not exists stocks "
                                "(name varchar(10),price varchar(10),t1 varchar(10),"
                                "t2 varchar(10),t3 varchar(10),t4 varchar(10),t5 varchar(10),t6 varchar(10),t7 varchar(10)"
                                ",t8 varchar(10),t9 varchar(10),t10 varchar(10),t11 varchar(10),"
                                " primary key (name))")
            self.cursor.execute("delete from stocks")  # clear rows left by previous runs
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def process_item(self, item, spider):
        # pymysql uses %s placeholders; sqlite-style "?" raises a syntax error
        insert = "insert into stocks (name,price,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11) " \
                 "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(insert, (
            item['name'], item['price'], item['t1'], item['t2'], item['t3'], item['t4'], item['t5'],
            item['t6'], item['t7'], item['t8'], item['t9'], item['t10'], item['t11']))
        return item

    def close_spider(self, spider):
        self.con.commit()
        self.con.close()
        print("closed")
  4. settings.py
BOT_NAME = "demo1"

SPIDER_MODULES = ["demo1.spiders"]
NEWSPIDER_MODULE = "demo1.spiders"


ROBOTSTXT_OBEY = True

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

ITEM_PIPELINES = {
    'demo1.pipelines.StockPipeline': 300,  # 3-2
}

Run results:

Reflections

This Scrapy spider fetches stock data from Eastmoney. In the parse method, the spider receives the API response, strips the JSONP callback wrapper from the text, and converts the remaining payload into a dict with json.loads. It then reads the required fields from each record in data['diff'], fills a StocksItem instance, and hands it to Scrapy with the yield keyword so the pipeline can store it. Note that base_url takes a page number but start_urls only requests page 1; a multi-page sketch follows.
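
The JSONP handling can also be tried outside Scrapy. This is a minimal standalone sketch, assuming the endpoint still answers the same way; the trimmed parameter set and the two-page loop are illustrative, not taken from the project.

import json
import re
import urllib.request

base_url = ('http://36.push2.eastmoney.com/api/qt/clist/get?cb=jQuery&pn={}'
            '&pz=20&po=1&np=1&fltt=2&invt=2&fid=f3'
            '&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048'
            '&fields=f2,f3,f12,f14')

for pn in range(1, 3):  # first two pages via the pn parameter
    raw = urllib.request.urlopen(base_url.format(pn)).read().decode('utf-8')
    # Strip the "jQuery(...)" wrapper down to the bare JSON payload
    payload = re.search(r'\((.*)\)', raw, re.S).group(1)
    for rec in json.loads(payload)['data']['diff']:
        print(rec['f12'], rec['f14'], rec['f2'], rec['f3'])  # code, name, price, change %

The same loop could be moved into the spider by yielding scrapy.Request(base_url.format(pn)) for each further page.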

Assignment ③

Crawl foreign-exchange rate data from the Bank of China website.

  1. items.py
import scrapy

class BankItem(scrapy.Item):
    Currency = scrapy.Field()  # currency name
    TBP = scrapy.Field()       # telegraphic transfer (spot) buying price
    CBP = scrapy.Field()       # cash buying price
    TSP = scrapy.Field()       # telegraphic transfer (spot) selling price
    CSP = scrapy.Field()       # cash selling price
    Time = scrapy.Field()      # publish time

  2. 3-3.py
import scrapy
from demo2.items import BankItem

class BankSpider(scrapy.Spider):
    name = "BankSpider"
    allowed_domains = ['www.boc.cn']
    start_urls = ['http://www.boc.cn/sourcedb/whpj/']
    base_url = 'https://www.boc.cn/sourcedb/whpj/index_'
    end_url = '.html'
    page = 0

    def parse(self, response):
        trs = response.xpath('//tr')
        for tr in trs[2:29]:  # the rate rows; the first rows are headers
            Currency = tr.xpath('./td[1]/text()').extract_first()
            # Fall back to 'none' when a cell has no text node
            TBP = tr.xpath('./td[2]/text()').extract_first(default='none')
            CBP = tr.xpath('./td[3]/text()').extract_first(default='none')
            TSP = tr.xpath('./td[4]/text()').extract_first(default='none')
            CSP = tr.xpath('./td[5]/text()').extract_first(default='none')
            Time = tr.xpath('./td[7]/text()').extract_first()
            book = BankItem(Currency=Currency, TBP=TBP, CBP=CBP, TSP=TSP, CSP=CSP, Time=Time)
            yield book

        # Request the next pages (index_1.html, index_2.html) with the same callback
        if self.page < 2:
            self.page += 1
            url = self.base_url + str(self.page) + self.end_url
            yield scrapy.Request(url=url, callback=self.parse)

  3. pipelines.py
import sqlite3

class BankPipeline:
    def __init__(self):
        # One sqlite3 connection for the whole run; creating the table and
        # inserting rows must target the same database file
        self.con = sqlite3.connect("MyDB.db")
        self.cursor = self.con.cursor()
        self.cursor.execute(
            "create table if not exists Bank "
            "(Currency text, TBP text, CBP text, TSP text, CSP text, Time text)")
        self.cursor.execute("delete from Bank")  # start each run with an empty table

    def process_item(self, item, spider):
        try:
            Currency = item.get('Currency')
            TBP = item.get('TBP')
            CBP = item.get('CBP')
            TSP = item.get('TSP')
            CSP = item.get('CSP')
            Time = item.get('Time')
            # Insert one row per item; sqlite3 uses "?" placeholders
            self.cursor.execute(
                "insert into Bank (Currency,TBP,CBP,TSP,CSP,Time) values(?,?,?,?,?,?)",
                (str(Currency), str(TBP), str(CBP), str(TSP), str(CSP), str(Time)))
            self.con.commit()
        except Exception as err:
            print(err)
        return item

  4. settings.py
BOT_NAME = "demo2"

SPIDER_MODULES = ["demo2.spiders"]
NEWSPIDER_MODULE = "demo2.spiders"

ROBOTSTXT_OBEY = True

REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

BOT_NAME = "Bank"

SPIDER_MODULES = ["Bank.spiders"]
NEWSPIDER_MODULE = "Bank.spiders"

ITEM_PIPELINES = {
    "demo2.pipelines.BankPipeline": 300,
}

Run results:

Reflections

This spider collects foreign-exchange rates from the Bank of China rate page. In the parse method, XPath selectors pick out each table row, the td cells are read into a BankItem, and yield passes each item to Scrapy for the pipeline to store; empty cells fall back to a default value instead of None. While pages remain, the spider requests the next index_N.html page and parses it with the same callback. The row-extraction pattern is sketched standalone below.
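
The row extraction can be exercised without a running spider by using parsel, the selector library Scrapy builds on. This is a minimal sketch over a made-up two-row table; the numbers are invented, not from the live page.

from parsel import Selector

# Stand-in for the BOC rate table; the real page has extra header rows
html = """
<table>
  <tr><td>USD</td><td>718.5</td><td>712.6</td><td>721.5</td><td>721.5</td>
      <td></td><td>2023-11-01 10:30:00</td></tr>
  <tr><td>EUR</td><td>759.1</td><td>735.5</td><td>764.7</td><td>764.7</td>
      <td></td><td>2023-11-01 10:30:00</td></tr>
</table>
"""

sel = Selector(text=html)
for tr in sel.xpath('//tr'):
    currency = tr.xpath('./td[1]/text()').extract_first()
    tbp = tr.xpath('./td[2]/text()').extract_first(default='none')
    empty = tr.xpath('./td[6]/text()').extract_first(default='none')  # missing text -> 'none'
    time = tr.xpath('./td[7]/text()').extract_first()
    print(currency, tbp, empty, time)

The empty sixth cell shows why the default matters: it has no text node, so extract_first would otherwise return None.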