2023数据采集与融合技术实践作业三

发布时间 2023-10-31 21:46:56作者: 溯-

作业①:

1)、要求:指定一个网站,爬取这个网站中的所有图片,例如:中国气象网(http://www.weather.com.cn)。使用scrapy框架分别实现单线程和多线程的方式爬取。——务必控制总页数(学号尾数2位)、总下载的图片数量(尾数后3位)等限制爬取的措施。

题目1链接:题目1
具体代码如下:
MySpider.py

import scrapy
from scrapy.selector import Selector
from ..items import PictureItem
class MySpider(scrapy.Spider):
    """Crawl the China Weather homepage and yield one item per <img> URL."""
    name = "mySpider"
    def start_requests(self):
        start_url = 'http://www.weather.com.cn/'
        yield scrapy.Request(url=start_url, callback=self.parse)
    def parse(self, response):
        try:
            selector = Selector(text=response.text)
            # Collect the src of every <img> anywhere under <body>.
            for src in selector.xpath("/html/body//img/@src").extract():
                item = PictureItem()
                # Fix: resolve relative and protocol-relative srcs against
                # the page URL so the download pipeline gets absolute URLs.
                item["image_urls"] = response.urljoin(src)
                yield item
        except Exception as err:
            # Best-effort crawl: report the problem but keep the spider alive.
            print(err)

items.py

import scrapy
class PictureItem(scrapy.Item):
    """Item carrying one image URL from the spider to the pipeline."""
    # A single URL string (despite the plural name) — the custom
    # PicturePipeline downloads it directly with requests.
    image_urls = scrapy.Field()

pipelines.py

import os
import requests
class PicturePipeline:
    """Download each image URL carried by an item into a local folder."""
    # Directory the images are written to.
    SAVE_DIR = "C:/Users/19350/Desktop/images/"
    # Class-level counter: numbers the files across all processed items.
    num = 1
    def process_item(self, item, spider):
        url = item["image_urls"]
        print("正在爬取第" + str(PicturePipeline.num) + "张图片")
        print(url)
        # Fix: add a timeout so one stalled download cannot hang the whole
        # pipeline, and fail loudly on HTTP errors instead of saving an
        # error page as an image.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        # Fix: create the target directory on first use instead of
        # crashing when it does not exist yet.
        os.makedirs(PicturePipeline.SAVE_DIR, exist_ok=True)
        filename = url.split("/")[-1]
        path = os.path.join(PicturePipeline.SAVE_DIR,
                            str(PicturePipeline.num) + "-" + filename)
        with open(path, 'wb') as f:
            f.write(resp.content)
        PicturePipeline.num = PicturePipeline.num + 1
        return item

settings.py

# Scrapy project settings for the image crawler.
BOT_NAME = "picture"
# Where Scrapy looks for spider classes.
SPIDER_MODULES = ["picture.spiders"]
NEWSPIDER_MODULE = "picture.spiders"
# Respect the target site's robots.txt rules.
ROBOTSTXT_OBEY = True
# Route every scraped item through the download pipeline (priority 300).
ITEM_PIPELINES = {
    "picture.pipelines.PicturePipeline": 300,
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

run.py

from scrapy import cmdline
# Launch the spider programmatically; LOG_ENABLED=False keeps the console
# output limited to the pipeline's own prints.
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())

运行结果如下:
image
image

2)、心得体会

代码首先访问网站并获取该网站所有照片的链接;然后在pipelines中分别访问每张照片的链接,输出图片的链接并爬取图片。通过本次作业,我学会了如何通过scrapy实现爬取网页图片,对scrapy有了一定的了解。

作业②:

1)、要求:熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取股票相关信息。候选网站:东方财富网:https://www.eastmoney.com/

题目2链接:题目2
具体代码如下:
MySpider.py

import scrapy
import re
import json
from ..items import StockItem
from ..pipelines import Database
class MySpider(scrapy.Spider):
    """Fetch two pages of A-share quotes from the Eastmoney list API."""
    name = "mySpider"
    # Create the target table once, at class-definition time, so the
    # pipeline can insert rows as soon as items start arriving.
    db = Database()
    db.create_table()
    def start_requests(self):
        # Pages 1 and 2, 20 stocks per page (pz=20 in the query string).
        for page in range(1, 3):
            start_url = "http://6.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240015414280997085639_1696665041821&pn=" + str(page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18&_=1696665041822"
            yield scrapy.Request(url=start_url, callback=self.parse)
    def parse(self, response):
        # Recover the page number from the request URL ("...&pn=<page>&pz=...").
        # Fix: parse with int() instead of eval() — never evaluate URL content.
        page = int(response.url.split("&pn=")[1].split("&pz")[0])
        # The response is JSONP; pull the "diff" array of quote objects out.
        diff = re.findall(r'"diff":\[(.*?)\]', response.text)
        # Fix: decode with json.loads instead of eval() — never evaluate
        # text received from the network as Python code.
        data = json.loads("[" + diff[0] + "]")
        # Number rows continuously across pages (20 rows per page).
        for num, quote in enumerate(data, start=1):
            item = StockItem()
            item["num"] = num + (page - 1) * 20
            item["s1"] = quote["f12"]
            item["s2"] = quote["f14"]
            item["s3"] = quote["f2"]
            item["s4"] = quote["f3"]
            item["s5"] = quote["f4"]
            item["s6"] = quote["f5"]
            item["s7"] = quote["f6"]
            item["s8"] = quote["f7"]
            item["s9"] = quote["f15"]
            item["s10"] = quote["f16"]
            item["s11"] = quote["f17"]
            item["s12"] = quote["f18"]
            yield item

items.py

import scrapy
class StockItem(scrapy.Item):
    """One stock quote row; field meanings follow Database.show's header."""
    num = scrapy.Field()   # serial number (continuous across pages)
    s1 = scrapy.Field()    # stock code (f12)
    s2 = scrapy.Field()    # stock name (f14)
    s3 = scrapy.Field()    # latest price (f2)
    s4 = scrapy.Field()    # change percent (f3)
    s5 = scrapy.Field()    # change amount (f4)
    s6 = scrapy.Field()    # trade volume (f5)
    s7 = scrapy.Field()    # turnover (f6)
    s8 = scrapy.Field()    # amplitude (f7)
    s9 = scrapy.Field()    # day high (f15)
    s10 = scrapy.Field()   # day low (f16)
    s11 = scrapy.Field()   # today's open (f17)
    s12 = scrapy.Field()   # previous close (f18)

pipelines.py

import sqlite3
class Database:
    """Thin wrapper around one shared SQLite connection for stock rows."""
    # Class-level connection/cursor: every Database() instance shares them,
    # so the spider (table creation) and the pipeline (inserts/printing)
    # operate on the same database state.
    conn = sqlite3.connect("stock.db")
    cursor = conn.cursor()
    def create_table(self):
        """Create the stock table, or empty it if it already exists."""
        try:
            self.cursor.execute('''  
                CREATE TABLE stock (  
                    num TEXT,
                    s1 TEXT,
                    s2 TEXT,
                    s3 TEXT,
                    s4 TEXT,
                    s5 TEXT,
                    s6 TEXT,
                    s7 TEXT,
                    s8 TEXT,
                    s9 TEXT,
                    s10 TEXT,
                    s11 TEXT,
                    s12 TEXT
                );  
            ''')
        except sqlite3.OperationalError:
            # Fix: catch only the "table already exists" error instead of a
            # bare except that would also hide genuine SQL mistakes.
            self.cursor.execute("delete from stock")
        self.conn.commit()
    def insert_data(self, num, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12):
        """Insert one stock row and persist it."""
        self.cursor.execute(
            "insert into stock (num,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12) values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
            (num, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12))
        # Fix: commit right away — close() is never called during the crawl,
        # so without this commit the inserted rows were lost on exit.
        self.conn.commit()
    def show(self):
        """Print every stored row as an aligned table."""
        self.cursor.execute("select * from stock")
        rows = self.cursor.fetchall()
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format("序号", "股票代码", "名称","最新报价","涨跌幅","跌涨额","成交量","成交额","振幅","最高","最低","今开","昨收"))
        for row in rows:
            print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8],row[9],row[10],row[11],row[12]))
    def close(self):
        """Flush pending changes and close the shared connection."""
        self.conn.commit()
        self.conn.close()
class StockPipeline:
    """Store each scraped stock item into SQLite; dump all rows at the end."""
    # Total number of items expected (2 pages x 20 rows); reaching it
    # triggers the final table printout.
    EXPECTED = 40
    count = 0
    # Field order matches the stock table's column order.
    FIELDS = ("num", "s1", "s2", "s3", "s4", "s5", "s6",
              "s7", "s8", "s9", "s10", "s11", "s12")
    def process_item(self, item, spider):
        StockPipeline.count = StockPipeline.count + 1
        db = Database()
        # Fix: unpack the item with one comprehension instead of thirteen
        # copy-pasted assignments.
        db.insert_data(*[item[key] for key in StockPipeline.FIELDS])
        # Fix: commit so rows survive process exit (close() is never called).
        db.conn.commit()
        if StockPipeline.count == StockPipeline.EXPECTED:
            db.show()
        return item

settings.py

# Scrapy project settings for the stock crawler.
BOT_NAME = "stock"
# Where Scrapy looks for spider classes.
SPIDER_MODULES = ["stock.spiders"]
NEWSPIDER_MODULE = "stock.spiders"
# The quote API is not covered by robots.txt rules we want to honor here.
ROBOTSTXT_OBEY = False
# One request at a time, so page 1's items reach the pipeline before page 2's.
CONCURRENT_REQUESTS = 1
# Route every scraped item through the database pipeline (priority 300).
ITEM_PIPELINES = {
    "stock.pipelines.StockPipeline": 300,
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

run.py

from scrapy import cmdline
# Launch the spider programmatically; LOG_ENABLED=False keeps the console
# output limited to the pipeline's own prints.
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())

运行结果如下:
image

2)、心得体会

代码首先访问url并且获取数据;在pipelines中实现分别把每条数据保存到数据库当中,同时控制当存储的数据达到一定数量的时候,输出保存的数据。通过本次作业,我对通过scrapy爬取数据更加了解。

作业③:

1)、要求:熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法;使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。候选网站:中国银行网:https://www.boc.cn/sourcedb/whpj/

题目3链接:题目3
具体代码如下:
MySpider.py

import scrapy
from scrapy.selector import Selector
from ..items import DataItem
from ..pipelines import Database
class MySpider(scrapy.Spider):
    """Scrape the Bank of China foreign-exchange quotation table."""
    name = "mySpider"
    # Create the target table once, at class-definition time, so the
    # pipeline can insert rows as soon as items start arriving.
    db = Database()
    db.create_table()
    def start_requests(self):
        url = "https://www.boc.cn/sourcedb/whpj/"
        yield scrapy.Request(url=url, callback=self.parse)
    def parse(self, response):
        selector = Selector(text=response.text)
        # position()>1 skips the header row of the rate table.
        rows = selector.xpath("/html/body/div/div/div/div/table/tr[position()>1]")
        for row in rows:
            item = DataItem()
            # Fix: table columns 1-8 map straight onto fields s1-s8, so
            # iterate instead of eight copy-pasted extraction lines.
            for col in range(1, 9):
                item["s%d" % col] = row.xpath("./td[%d]/text()" % col).extract_first()
            yield item

items.py

import scrapy
class DataItem(scrapy.Item):
    """One exchange-rate row; field meanings follow Database.show's header."""
    s1 = scrapy.Field()   # currency name
    s2 = scrapy.Field()   # spot (telegraphic transfer) buying rate
    s3 = scrapy.Field()   # cash buying rate
    s4 = scrapy.Field()   # spot selling rate
    s5 = scrapy.Field()   # cash selling rate
    s6 = scrapy.Field()   # BOC conversion (middle) rate
    s7 = scrapy.Field()   # release date
    s8 = scrapy.Field()   # release time

pipelines.py

import sqlite3
class Database:
    """Thin wrapper around one shared SQLite connection for rate rows."""
    # Class-level connection/cursor: every Database() instance shares them,
    # so the spider (table creation) and the pipeline (inserts/printing)
    # operate on the same database state.
    conn = sqlite3.connect("data.db")
    cursor = conn.cursor()
    def create_table(self):
        """Create the data table, or empty it if it already exists."""
        try:
            self.cursor.execute('''  
                CREATE TABLE data (  
                    s1 TEXT,
                    s2 TEXT,
                    s3 TEXT,
                    s4 TEXT,
                    s5 TEXT,
                    s6 TEXT,
                    s7 TEXT,
                    s8 TEXT
                );  
            ''')
        except sqlite3.OperationalError:
            # Fix: catch only the "table already exists" error instead of a
            # bare except that would also hide genuine SQL mistakes.
            self.cursor.execute("delete from data")
        self.conn.commit()
    def insert_data(self, s1, s2, s3, s4, s5, s6, s7, s8):
        """Insert one exchange-rate row and persist it."""
        self.cursor.execute(
            "insert into data (s1,s2,s3,s4,s5,s6,s7,s8) values (?,?,?,?,?,?,?,?)",
            (s1, s2, s3, s4, s5, s6, s7, s8))
        # Fix: commit right away — close() is never called during the crawl,
        # so without this commit the inserted rows were lost on exit.
        self.conn.commit()
    def show(self):
        """Print every stored row as an aligned table."""
        self.cursor.execute("select * from data")
        rows = self.cursor.fetchall()
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format("货币名称", "现汇买入价", "现钞买入价","现汇卖出价","现钞卖出价","中行折算价","发布日期","发布时间"))
        for row in rows:
            print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7]))
    def close(self):
        """Flush pending changes and close the shared connection."""
        self.conn.commit()
        self.conn.close()

class DataPipeline:
    """Store each exchange-rate item into SQLite; dump all rows at the end."""
    # Number of currency rows on the page; reaching it triggers the printout.
    EXPECTED = 27
    count = 0
    def process_item(self, item, spider):
        DataPipeline.count = DataPipeline.count + 1
        db = Database()
        # Fix: collapse five copy-pasted "== None" checks into one loop and
        # use the idiomatic identity test "is None". Missing cells (a
        # currency with no quoted rate) are stored as the string "NULL";
        # this now covers all eight fields consistently.
        values = [item[key] if item[key] is not None else "NULL"
                  for key in ("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8")]
        db.insert_data(*values)
        # Fix: commit so rows survive process exit (close() is never called).
        db.conn.commit()
        if DataPipeline.count == DataPipeline.EXPECTED:
            db.show()
        return item

settings.py

# Scrapy project settings for the exchange-rate crawler.
BOT_NAME = "data"
# Where Scrapy looks for spider classes.
SPIDER_MODULES = ["data.spiders"]
NEWSPIDER_MODULE = "data.spiders"
# Respect the target site's robots.txt rules.
ROBOTSTXT_OBEY = True
# Route every scraped item through the database pipeline (priority 300).
ITEM_PIPELINES = {
    "data.pipelines.DataPipeline": 300,
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

run.py

from scrapy import cmdline
# Launch the spider programmatically; LOG_ENABLED=False keeps the console
# output limited to the pipeline's own prints.
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())

运行结果如下:
image

2)、心得体会

代码同样是先访问url获取数据,然后在pipelines中实现数据的保存和输出。通过本次作业,我对scrapy的使用更加熟练。