Assignment ①:
1) Requirements: Choose a website and crawl all of the images on it, e.g., China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement both a single-threaded and a multi-threaded crawl. Be sure to limit the crawl, e.g., by capping the total number of pages (last two digits of the student ID) and the total number of downloaded images (last three digits of the student ID).
Link to Problem 1: Problem 1
The code is as follows:
MySpider.py
import scrapy
from scrapy.selector import Selector
from ..items import PictureItem

class MySpider(scrapy.Spider):
    name = "mySpider"

    def start_requests(self):
        start_url = 'http://www.weather.com.cn/'
        yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        try:
            data = response.text
            selector = Selector(text=data)
            # collect the src attribute of every <img> on the page
            src_list = selector.xpath("/html/body//img/@src").extract()
            for src in src_list:
                item = PictureItem()
                # urljoin turns relative image paths into absolute URLs; absolute ones are left unchanged
                item["image_urls"] = response.urljoin(src)
                yield item
        except Exception as err:
            print(err)
items.py
import scrapy

class PictureItem(scrapy.Item):
    image_urls = scrapy.Field()
pipelines.py
import requests

class PicturePipeline:
    num = 1

    def process_item(self, item, spider):
        url = item["image_urls"]
        print("正在爬取第" + str(PicturePipeline.num) + "张图片")
        print(url)
        # download the image and save it under a sequential number plus its original file name
        a = requests.get(url)
        link = url.split("/")[-1]
        with open("C:/Users/19350/Desktop/images/" + str(PicturePipeline.num) + "-" + link, 'wb') as f:
            f.write(a.content)
        PicturePipeline.num = PicturePipeline.num + 1
        return item
settings.py
BOT_NAME = "picture"
SPIDER_MODULES = ["picture.spiders"]
NEWSPIDER_MODULE = "picture.spiders"
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
"picture.pipelines.PicturePipeline": 300,
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
run.py
from scrapy import cmdline
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())
The results are as follows:


2) Reflections
The code first visits the site and collects the links of all images on the page; the pipeline then requests each image link in turn, prints it, and downloads the image. Through this assignment I learned how to crawl web images with Scrapy and gained a basic understanding of the framework.
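The assignment also asks for single-threaded versus multi-threaded crawling and for caps on the total pages and images. In Scrapy both can be controlled from settings.py rather than from the spider itself; a minimal sketch is given below, where the concrete limits are only placeholders, since the real values depend on the student ID:

# Hypothetical additions to settings.py; the exact caps are placeholders, not my actual ID digits.
# CONCURRENT_REQUESTS controls how many requests Scrapy issues in parallel:
# 1 is effectively single-threaded, a larger value enables concurrent crawling.
CONCURRENT_REQUESTS = 1        # set to e.g. 16 for the multi-threaded run

# The built-in CloseSpider extension stops the crawl once a limit is reached.
CLOSESPIDER_PAGECOUNT = 26     # placeholder: last 2 digits of the student ID
CLOSESPIDER_ITEMCOUNT = 126    # placeholder: last 3 digits of the student ID

Switching between the single-threaded and multi-threaded runs then only requires changing CONCURRENT_REQUESTS, with no change to the spider or pipeline code.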
Assignment ②:
1) Requirements: Become proficient in serializing and outputting data with Scrapy's Item and Pipeline; crawl stock information using the Scrapy framework + XPath + MySQL storage. Candidate site: Eastmoney: https://www.eastmoney.com/
Link to Problem 2: Problem 2
The code is as follows:
MySpider.py
import scrapy
import re
from ..items import StockItem
from ..pipelines import Database

class MySpider(scrapy.Spider):
    name = "mySpider"
    # create the table (or clear it) before the crawl starts
    db = Database()
    db.create_table()

    def start_requests(self):
        for page in range(1, 3):
            start_url = "http://6.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240015414280997085639_1696665041821&pn=" + str(page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18&_=1696665041822"
            yield scrapy.Request(url=start_url, callback=self.parse)

    def parse(self, response):
        # recover the page number from the request URL
        page = int(response.url.split("&pn=")[1].split("&pz")[0])
        # the API returns JSONP; pull the "diff" list out of the callback wrapper
        text = response.text
        data = re.findall(r'"diff":\[(.*?)\]', text)
        data = list(eval(data[0]))
        num = 1
        for i in data:
            item = StockItem()
            item["num"] = num + (page - 1) * 20
            item["s1"] = i["f12"]   # stock code
            item["s2"] = i["f14"]   # name
            item["s3"] = i["f2"]    # latest price
            item["s4"] = i["f3"]    # change (%)
            item["s5"] = i["f4"]    # change amount
            item["s6"] = i["f5"]    # volume
            item["s7"] = i["f6"]    # turnover
            item["s8"] = i["f7"]    # amplitude
            item["s9"] = i["f15"]   # high
            item["s10"] = i["f16"]  # low
            item["s11"] = i["f17"]  # open
            item["s12"] = i["f18"]  # previous close
            num = num + 1
            yield item
items.py
import scrapy

class StockItem(scrapy.Item):
    num = scrapy.Field()
    s1 = scrapy.Field()
    s2 = scrapy.Field()
    s3 = scrapy.Field()
    s4 = scrapy.Field()
    s5 = scrapy.Field()
    s6 = scrapy.Field()
    s7 = scrapy.Field()
    s8 = scrapy.Field()
    s9 = scrapy.Field()
    s10 = scrapy.Field()
    s11 = scrapy.Field()
    s12 = scrapy.Field()
pipelines.py
import sqlite3

class Database:
    conn = sqlite3.connect("stock.db")
    cursor = conn.cursor()

    def create_table(self):
        try:
            self.cursor.execute('''
                CREATE TABLE stock (
                    num TEXT,
                    s1 TEXT,
                    s2 TEXT,
                    s3 TEXT,
                    s4 TEXT,
                    s5 TEXT,
                    s6 TEXT,
                    s7 TEXT,
                    s8 TEXT,
                    s9 TEXT,
                    s10 TEXT,
                    s11 TEXT,
                    s12 TEXT
                );
            ''')
        except sqlite3.OperationalError:
            # the table already exists, so just clear out the old rows
            self.cursor.execute("delete from stock")

    def insert_data(self, num, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12):
        self.cursor.execute(
            "insert into stock (num,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12) values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
            (num, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12))
        self.conn.commit()

    def show(self):
        self.cursor.execute("select * from stock")
        rows = self.cursor.fetchall()
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format(
            "序号", "股票代码", "名称", "最新报价", "涨跌幅", "涨跌额", "成交量", "成交额", "振幅", "最高", "最低", "今开", "昨收"))
        for row in rows:
            print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format(
                row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[9], row[10], row[11], row[12]))

    def close(self):
        self.conn.commit()
        self.conn.close()

class StockPipeline:
    count = 0

    def process_item(self, item, spider):
        StockPipeline.count = StockPipeline.count + 1
        db = Database()
        db.insert_data(item["num"], item["s1"], item["s2"], item["s3"], item["s4"], item["s5"], item["s6"],
                       item["s7"], item["s8"], item["s9"], item["s10"], item["s11"], item["s12"])
        # 2 pages * 20 stocks per page = 40 records; print everything once they are all stored
        if StockPipeline.count == 40:
            db.show()
        return item
settings.py
BOT_NAME = "stock"
SPIDER_MODULES = ["stock.spiders"]
NEWSPIDER_MODULE = "stock.spiders"
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 1
ITEM_PIPELINES = {
"stock.pipelines.StockPipeline": 300,
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
run.py
from scrapy import cmdline
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())
The results are as follows:

2) Reflections
The code first requests the API URL and parses the returned data; the pipeline saves each record to the database and, once the number of stored records reaches the expected total, prints everything that has been saved. This assignment gave me a better understanding of crawling data with Scrapy.
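The requirement mentions MySQL, while the code above uses SQLite for simplicity. A minimal sketch of a MySQL-backed version of the Database class using pymysql is shown below; the connection parameters (host, user, password, database name) are placeholders, not an actual configuration:

# A sketch of the same Database class backed by MySQL via pymysql.
# host/user/password/database are placeholder values.
import pymysql

class MySQLDatabase:
    def __init__(self):
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="123456", database="spider",
                                    charset="utf8mb4")
        self.cursor = self.conn.cursor()

    def create_table(self):
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS stock (
                num VARCHAR(16), s1 VARCHAR(16), s2 VARCHAR(32),
                s3 VARCHAR(16), s4 VARCHAR(16), s5 VARCHAR(16),
                s6 VARCHAR(16), s7 VARCHAR(32), s8 VARCHAR(16),
                s9 VARCHAR(16), s10 VARCHAR(16), s11 VARCHAR(16), s12 VARCHAR(16)
            )
        """)

    def insert_data(self, *values):
        # pymysql uses %s placeholders instead of sqlite3's "?"
        self.cursor.execute(
            "INSERT INTO stock (num,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", values)
        self.conn.commit()

    def close(self):
        self.conn.close()

The only change needed in StockPipeline would be to instantiate this MySQLDatabase instead of the SQLite-based Database.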
Assignment ③:
1) Requirements: Become proficient in serializing and outputting data with Scrapy's Item and Pipeline; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL storage. Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
Link to Problem 3: Problem 3
The code is as follows:
MySpider.py
import scrapy
from scrapy.selector import Selector
from ..items import DataItem
from ..pipelines import Database

class MySpider(scrapy.Spider):
    name = "mySpider"
    # create the table (or clear it) before the crawl starts
    db = Database()
    db.create_table()

    def start_requests(self):
        url = "https://www.boc.cn/sourcedb/whpj/"
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        text = response.text
        selector = Selector(text=text)
        # every row of the exchange-rate table except the header row
        rows = selector.xpath("/html/body/div/div/div/div/table/tr[position()>1]")
        for i in rows:
            item = DataItem()
            item["s1"] = i.xpath("./td[1]/text()").extract_first()  # currency name
            item["s2"] = i.xpath("./td[2]/text()").extract_first()  # spot buying rate
            item["s3"] = i.xpath("./td[3]/text()").extract_first()  # cash buying rate
            item["s4"] = i.xpath("./td[4]/text()").extract_first()  # spot selling rate
            item["s5"] = i.xpath("./td[5]/text()").extract_first()  # cash selling rate
            item["s6"] = i.xpath("./td[6]/text()").extract_first()  # BOC conversion rate
            item["s7"] = i.xpath("./td[7]/text()").extract_first()  # release date
            item["s8"] = i.xpath("./td[8]/text()").extract_first()  # release time
            yield item
items.py
import scrapy

class DataItem(scrapy.Item):
    s1 = scrapy.Field()
    s2 = scrapy.Field()
    s3 = scrapy.Field()
    s4 = scrapy.Field()
    s5 = scrapy.Field()
    s6 = scrapy.Field()
    s7 = scrapy.Field()
    s8 = scrapy.Field()
pipelines.py
import sqlite3

class Database:
    conn = sqlite3.connect("data.db")
    cursor = conn.cursor()

    def create_table(self):
        try:
            self.cursor.execute('''
                CREATE TABLE data (
                    s1 TEXT,
                    s2 TEXT,
                    s3 TEXT,
                    s4 TEXT,
                    s5 TEXT,
                    s6 TEXT,
                    s7 TEXT,
                    s8 TEXT
                );
            ''')
        except sqlite3.OperationalError:
            # the table already exists, so just clear out the old rows
            self.cursor.execute("delete from data")

    def insert_data(self, s1, s2, s3, s4, s5, s6, s7, s8):
        self.cursor.execute(
            "insert into data (s1,s2,s3,s4,s5,s6,s7,s8) values (?,?,?,?,?,?,?,?)",
            (s1, s2, s3, s4, s5, s6, s7, s8))
        self.conn.commit()

    def show(self):
        self.cursor.execute("select * from data")
        rows = self.cursor.fetchall()
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format(
            "货币名称", "现汇买入价", "现钞买入价", "现汇卖出价", "现钞卖出价", "中行折算价", "发布日期", "发布时间"))
        for row in rows:
            print("{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}".format(
                row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))

    def close(self):
        self.conn.commit()
        self.conn.close()

class DataPipeline:
    count = 0

    def process_item(self, item, spider):
        DataPipeline.count = DataPipeline.count + 1
        db = Database()
        # some currencies have no spot/cash quotation; store "NULL" for the missing cells
        values = [item[key] if item[key] is not None else "NULL"
                  for key in ("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8")]
        db.insert_data(*values)
        # the crawled table yields 27 data rows here; print everything once they are all stored
        if DataPipeline.count == 27:
            db.show()
        return item
settings.py
BOT_NAME = "data"
SPIDER_MODULES = ["data.spiders"]
NEWSPIDER_MODULE = "data.spiders"
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
"data.pipelines.DataPipeline": 300,
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
run.py
from scrapy import cmdline
cmdline.execute("scrapy crawl mySpider -s LOG_ENABLED=False".split())
The results are as follows:

2) Reflections
As before, the code first requests the URL and extracts the data, then saves and prints it in the pipeline. This assignment made me more proficient with Scrapy.
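One thing I would improve is where the database is opened and closed: the Database connection above lives in a class attribute and close() is never actually called. Scrapy pipelines provide open_spider and close_spider hooks for exactly this; a small sketch of the exchange-rate pipeline rewritten with them (reusing the Database class above) might look like this:

# A sketch of the same pipeline using Scrapy's open_spider/close_spider hooks,
# so the database is opened once per crawl and committed/closed when the spider finishes.
class DataPipeline:
    def open_spider(self, spider):
        # called once when the spider starts
        self.db = Database()
        self.db.create_table()

    def process_item(self, item, spider):
        # replace missing quotations with "NULL" before inserting
        values = [item[key] if item[key] is not None else "NULL"
                  for key in ("s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8")]
        self.db.insert_data(*values)
        return item

    def close_spider(self, spider):
        # called once when the crawl ends: print and close the database
        self.db.show()
        self.db.close()

With this arrangement the spider no longer has to create the table itself, and the final show()/close() calls run automatically when the crawl ends instead of relying on a hard-coded record count.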