Assignment ①
Experiment: Crawling the China Weather Network
- Requirements
Pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both a single-threaded and a multi-threaded way.
- Output
Print the URL of each downloaded image to the console, save the downloaded images in the images subfolder, and provide screenshots.
- Implementation
Approach:

Gitee folder link
Code:
Single-threaded:
import scrapy
from bs4 import UnicodeDammit
import urllib.request
import os


class MySpider(scrapy.Spider):
    name = "dan"
    start_urls = ['http://www.weather.com.cn/weather/101280601.shtml']
    headers = {
        "User-Agent": "Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"
    }

    def parse(self, response):
        try:
            # Decode the page body into Unicode text
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            # Parse the page with Scrapy's Selector
            selector = scrapy.Selector(text=data)
            # Extract the URLs of all images
            srcs = selector.xpath("//img/@src")
            for src in srcs:
                print(src.extract())
                # Download each image
                self.download(src.extract())
        except Exception as err:
            print(err)

    def download(self, url):
        try:
            # Keep the extension if the URL ends with one such as ".jpg"
            if url[len(url) - 4] == ".":
                ext = url[len(url) - 4:]
            else:
                ext = ""
            # Create the "images" folder if it does not exist yet
            if not os.path.exists('images'):
                os.makedirs('images')
            # Build the request and fetch the image data
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            # Use the number of files already downloaded to build the file name
            count = len(os.listdir('images')) + 1
            # Write the image data to disk
            fobj = open("images\\" + str(count) + ext, "wb")
            fobj.write(data)
            fobj.close()
            print("downloaded " + str(count) + ext)
        except Exception as err:
            print(err)
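The spider is normally started from the project directory with the command scrapy crawl dan. As an alternative, the sketch below shows how the same class could be driven programmatically through Scrapy's CrawlerProcess API; this is an illustration, not part of the assignment code.
# Minimal sketch (not part of the assignment code): running the spider
# programmatically instead of via "scrapy crawl dan".
# Assumes MySpider (the single-threaded spider above) is importable.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"LOG_LEVEL": "ERROR"})
process.crawl(MySpider)   # MySpider is the class defined above
process.start()           # blocks until the crawl finishes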
Multi-threaded:
import scrapy
from bs4 import UnicodeDammit
import urllib.request
import threading
import os


class MySpider(scrapy.Spider):
    name = "mySpider"
    start_urls = ['http://www.weather.com.cn/weather/101280601.shtml']
    count = 0
    threads = []
    headers = {
        "User-Agent": "Mozilla/5.0(Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"
    }

    def parse(self, response):
        try:
            # Decode the page body into Unicode text
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            # Parse the page with Scrapy's Selector
            selector = scrapy.Selector(text=data)
            # Extract the URLs of all images
            srcs = selector.xpath("//img/@src")
            for src in srcs:
                print(src.extract())
                # Increment the counter used for file names
                self.count = self.count + 1
                # Create and start a new thread for each image download
                T = threading.Thread(target=self.download, args=(src.extract(), self.count))
                T.daemon = False
                T.start()
                self.threads.append(T)
        except Exception as err:
            print(err)

    def closed(self, reason):
        # Wait for all download threads to finish when the spider closes
        for t in self.threads:
            t.join()

    def download(self, url, count):
        try:
            # Keep the extension if the URL ends with one such as ".jpg"
            if url[len(url) - 4] == ".":
                ext = url[len(url) - 4:]
            else:
                ext = ""
            # Create the "images" folder if it does not exist yet
            if not os.path.exists('images'):
                os.makedirs('images')
            # Build the request and fetch the image data
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            # Write the image data to disk
            fobj = open("images\\" + str(count) + ext, "wb")
            fobj.write(data)
            fobj.close()
            print("downloaded " + str(count) + ext)
        except Exception as err:
            print(err)
Pipeline class:
import os
import urllib.request


class GetpicturePipeline:
    count = 1
    urllist = []

    def process_item(self, item, spider):
        GetpicturePipeline.count += 1
        try:
            # Create the "images" folder if it does not exist yet
            if not os.path.exists('images'):
                os.makedirs('images')
            # Only download URLs that have not been seen before
            if item['url'] not in GetpicturePipeline.urllist:
                GetpicturePipeline.urllist.append(item['url'])
                data = urllib.request.urlopen(item['url']).read()
                with open('images/' + str(GetpicturePipeline.count) + '.jpg', "wb") as f:
                    f.write(data)
        except Exception as err:
            print(err)
        return item
Items class:
import scrapy


class PictureItem(scrapy.Item):
    # define the fields for your item here like:
    url = scrapy.Field()
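The two spiders above download images directly inside download(), so PictureItem and GetpicturePipeline are not actually exercised by them. As a hedged illustration of how the Item/Pipeline path could be wired up instead, the sketch below yields one item per image URL; the spider name and the import path of PictureItem are assumptions, and it presumes GetpicturePipeline is enabled through ITEM_PIPELINES in settings.py.
# Illustrative sketch only (not the assignment's spider): handing image URLs
# to GetpicturePipeline via PictureItem instead of downloading them directly.
import scrapy
from demo.items import PictureItem  # hypothetical project path

class ImageItemSpider(scrapy.Spider):
    name = "image_items"
    start_urls = ['http://www.weather.com.cn/weather/101280601.shtml']

    def parse(self, response):
        for src in response.xpath("//img/@src").getall():
            item = PictureItem()
            item['url'] = response.urljoin(src)  # resolve relative image URLs
            yield item  # the pipeline receives and downloads it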
Results:
Single-threaded:


Multi-threaded:


Reflections
This assignment deepened my understanding of Python multithreading and the Scrapy framework. The concurrent execution of the threads is clearly visible in the order of the output, and comparing the total download time of the two versions made the efficiency gain of multithreading very concrete.
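To back up the timing comparison mentioned above, here is a minimal, self-contained sketch (not taken from the assignment code) that measures sequential versus threaded downloads of the same URLs; the urls list is a hypothetical placeholder.
# Standalone sketch for timing sequential vs. threaded downloads.
# The URL list is a hypothetical placeholder, not real assignment data.
import threading
import time
import urllib.request

urls = ["http://www.example.com/img1.jpg", "http://www.example.com/img2.jpg"]

def fetch(url):
    try:
        urllib.request.urlopen(url, timeout=10).read()
    except Exception as err:
        print(err)

# Sequential downloads
start = time.time()
for u in urls:
    fetch(u)
print("sequential:", time.time() - start, "s")

# Threaded downloads
start = time.time()
threads = [threading.Thread(target=fetch, args=(u,)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()
print("threaded:", time.time() - start, "s")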
Assignment ②
Experiment: Crawling Stock Information
- Requirements
Become proficient with serializing Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL database storage technical route.
Candidate site: Eastmoney: https://www.eastmoney.com/
- Output
| No. | Stock code | Stock name | Latest price | Change (%) | Change amount | Volume | Amplitude | High | Low | Open | Prev. close |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 688093 | N世华 | 28.47 | 10.92 | 26.13万 | 7.6亿 | 22.34 | 32.0 | 28.08 | 30.20 | 17.55 |
- Implementation
Gitee folder link
Code:
import scrapy
import re
import json
from demo.items import StocksItem
import math


class StocksSpider(scrapy.Spider):
    name = 'stocks'
    start_urls = [
        'http://31.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409705185363781139_1602849464971&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1602849464977']

    def parse(self, response):
        try:
            data = response.body.decode()
            # Each stock is one {...} block inside the JSONP response
            datas = re.findall(r"{.*?}", data[re.search(r"\[", data).start():])
            for n in range(len(datas)):
                stock = json.loads(datas[n])  # parse the text into a JSON object
                item = StocksItem()  # fill in the item fields
                item['code'] = stock['f12']
                item['name'] = stock['f14']
                item['latest_price'] = str(stock['f2'])
                item['range'] = str(stock['f3'])
                item['amount'] = str(stock['f4'])
                item['trading'] = str(stock['f5'])
                yield item
            # Total number of pages (20 stocks per page)
            all_page = math.ceil(int(re.findall(r'"total":(\d+)', response.body.decode())[0]) / 20)
            page = re.findall(r"pn=(\d+)", response.url)[0]  # current page number
            if int(page) < 3:  # only crawl the first three pages
                url = response.url.replace("pn=" + page, "pn=" + str(int(page) + 1))  # next page URL
                yield scrapy.Request(url=url, callback=self.parse)  # callback to parse
        except Exception as err:
            print(err)
Pipeline class:
import mysql.connector


class StocksPipeline:
    def __init__(self):
        self.create_connection()  # open the database connection
        self.create_table()  # create the stocks table

    def create_connection(self):
        self.conn = mysql.connector.connect(
            port=3307,  # database port
            user='root',  # database user
            password='123456',  # database password
            database='gupiao'  # database name
        )
        self.curr = self.conn.cursor()  # create a cursor

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS stocks""")  # drop the stocks table if it already exists
        self.curr.execute("""CREATE TABLE stocks (
            id INT AUTO_INCREMENT PRIMARY KEY,
            code VARCHAR(255),
            name VARCHAR(255),
            latest_price FLOAT,
            price_range FLOAT,
            amount FLOAT,
            trading FLOAT
        )""")  # create the stocks table

    def process_item(self, item, spider):
        self.store_db(item)  # store the item in the database
        return item

    def store_db(self, item):
        self.curr.execute("""INSERT INTO stocks (code, name, latest_price, price_range, amount, trading) VALUES (%s, %s, %s, %s, %s, %s)""",
                          (item['code'], item['name'], item['latest_price'], item['range'], item['amount'], item['trading']))  # insert one row
        self.conn.commit()  # commit so the row is persisted
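The pipeline above never closes the connection when the crawl ends. As a hedged sketch, a close_spider hook like the following could be added to StocksPipeline (close_spider is a standard Scrapy pipeline hook):
# Optional addition (not in the original pipeline): close the cursor and
# connection when the spider finishes.
def close_spider(self, spider):
    self.curr.close()
    self.conn.close()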
Items class:
import scrapy


class StocksItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    code = scrapy.Field()  # item field definitions
    name = scrapy.Field()
    latest_price = scrapy.Field()
    range = scrapy.Field()
    amount = scrapy.Field()
    trading = scrapy.Field()
Settings:
ITEM_PIPELINES = {
    'demo.pipelines.StocksPipeline': 300,
}
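After a run, the stored rows can be checked directly against MySQL. A minimal sketch, assuming the same connection parameters as the pipeline above:
# Quick check of the stored data, using the same connection parameters
# as StocksPipeline (port 3307, database "gupiao").
import mysql.connector

conn = mysql.connector.connect(port=3307, user='root', password='123456', database='gupiao')
curr = conn.cursor()
curr.execute("SELECT code, name, latest_price FROM stocks LIMIT 5")
for row in curr.fetchall():
    print(row)
conn.close()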
Results:

Reflections
This experiment builds on the previous one by adding database storage. At first the database would not connect, which turned out to be a port configuration problem; after that the spider was not scraping any content, which was finally fixed by revising the code.
Assignment ③
Crawling Foreign Exchange Data
- Requirements
Become proficient with serializing Item and Pipeline data in Scrapy; crawl foreign exchange data using the Scrapy framework + XPath + MySQL database storage technical route.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
Output:
| Currency | TBP | CBP | TSP | CSP | Time |
|---|---|---|---|---|---|
| 阿联酋迪拉姆 | 198.58 | 192.31 | 199.98 | 206.59 | 11:27:14 |
- Implementation
Approach:

Code:
Spider class:
import scrapy
from bank.items import BankItem


class BankSpider(scrapy.Spider):
    name = 'bank'  # spider name
    start_urls = ['https://www.boc.cn/sourcedb/whpj/index.html']  # start URL

    def parse(self, response):
        bank_list = response.xpath('//tr[position()>1]')  # rows of the exchange-rate table
        for bank in bank_list:
            item = BankItem()  # create a BankItem
            # extract the rate fields and assign them to the item
            item['Currency'] = bank.xpath('.//td[1]/text()').get()
            item['TBP'] = bank.xpath('.//td[2]/text()').get()
            item['CBP'] = bank.xpath('.//td[3]/text()').get()
            item['TSP'] = bank.xpath('.//td[4]/text()').get()
            item['CSP'] = bank.xpath('.//td[5]/text()').get()
            item['Time'] = bank.xpath('.//td[8]/text()').get()
            yield item  # hand the item to the pipeline

    def start_requests(self):
        num_pages = int(getattr(self, 'pages', 4))  # number of pages to crawl, default 4
        for page in range(1, num_pages + 1):
            if page == 1:
                start_url = 'https://www.boc.cn/sourcedb/whpj/index.html'
            else:
                start_url = f'https://www.boc.cn/sourcedb/whpj/index_{page - 1}.html'
            # build the request and point its callback at parse
            yield scrapy.Request(start_url, callback=self.parse)
Items class:
import scrapy


class BankItem(scrapy.Item):
    Currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    Time = scrapy.Field()
Pipeline class:
import pymysql
from itemadapter import ItemAdapter


class BankPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to the MySQL database
            self.con = pymysql.connect(port=3307, user="root", passwd="123456", db="bank", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # (re)create the bank table
            self.cursor.execute("DROP TABLE IF EXISTS bank")
            self.cursor.execute("CREATE TABLE IF NOT EXISTS bank("
                                "id int PRIMARY KEY,"
                                "Currency VARCHAR(32),"
                                "TSP VARCHAR(32),"
                                "CSP VARCHAR(32),"
                                "TBP VARCHAR(32),"
                                "CBP VARCHAR(32),"
                                "TIME VARCHAR(32))")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()  # commit the transaction
            self.con.close()  # close the connection
            self.opened = False
            print("closed")
            print("crawled", self.count, "records in total")

    def process_item(self, item, spider):
        try:
            print(item)
            if self.opened:
                self.count += 1
                # insert one row into the bank table
                self.cursor.execute(
                    "insert into bank(id,Currency,TSP,CSP,TBP,CBP,Time) values(%s,%s,%s,%s,%s,%s,%s)",
                    (self.count, item["Currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["Time"]))
        except Exception as err:
            print(err)
        return item
Settings:
ITEM_PIPELINES = {
    'bank.pipelines.BankPipeline': 300,
}
Results:


Reflections
This assignment is an upgraded version of the earlier stock-crawling task; the only extra requirement is to store the crawled data in a database. The spider itself is largely unchanged, and the main work is writing the pipeline that talks to the database. I spent a lot of time debugging the database link: the crawled content never showed up, and it turned out the connection had never been established.
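When nothing shows up in the table, testing the connection outside Scrapy narrows the problem down quickly. A minimal sketch, assuming the same connection parameters as the BankPipeline above:
# Standalone connection check, using the same parameters as BankPipeline
# (port 3307, database "bank"). If this fails, the pipeline cannot work either.
import pymysql

try:
    con = pymysql.connect(port=3307, user="root", passwd="123456", db="bank", charset="utf8")
    cur = con.cursor()
    cur.execute("SELECT VERSION()")
    print("connected, MySQL version:", cur.fetchone()[0])
    con.close()
except Exception as err:
    print("connection failed:", err)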