Assignment ①
Scraping the China Weather Network
Scrape images from the China Weather Network (weather.com.cn), using a single-threaded and a multi-threaded crawler.
- Single-threaded
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request

def MySpider(url):
    global count
    try:
        urls = []
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                img_url = urllib.request.urljoin(url, src)  # resolve the src against the page URL
                if img_url not in urls and count < 109:
                    urls.append(img_url)
                    print(img_url)
                    download(img_url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url):
    global count
    try:
        count = count + 1
        if url[len(url) - 4] == ".":  # keep the file extension if the URL ends with one, e.g. ".jpg"
            ext = url[len(url) - 4:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("C:\\Users\\zwl\\Desktop\\images\\images1\\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)

count = 0
url = "http://www.weather.com.cn/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31"}
name = ['forecast', 'alarm', 'satellite', 'space', 'trip', 'life', 'air', 'climate', 'science']
for i in range(0, 9):  # nine sub-pages
    if count < 109:
        url1 = url + name[i] + '/'
        MySpider(url1)
print("Scraped " + str(count) + " images in total")
Run results:


- Multi-threaded
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading

def MySpider1(start_url):
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                url = urllib.request.urljoin(start_url, src)
                if url not in urls and count < 109:
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # non-daemon threads so downloads can finish before the program exits
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url, count):
    try:
        if url[len(url) - 4] == ".":
            ext = url[len(url) - 4:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("C:\\Users\\zwl\\Desktop\\images\\images2\\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)

count = 0
url = "http://www.weather.com.cn/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31"}
threads = []
name = ['forecast', 'alarm', 'satellite', 'space', 'trip', 'life', 'air', 'climate', 'science']
for i in range(0, 9):  # nine sub-pages
    if count < 109:
        url1 = url + name[i] + '/'
        t = threading.Thread(target=MySpider1, args=(url1,))
        threads.append(t)
        t.start()
for t in threads:
    t.join()
print("Scraped " + str(count) + " images in total")
Run results:


Reflections
This crawler fetches the page images in both a single-threaded and a multi-threaded fashion. In the multi-threaded MySpider1 function, the crawler sends a request, receives the response, and parses it with BeautifulSoup. During parsing it finds all the img tags and extracts each image's URL. A thread is started per image, so several images are downloaded concurrently; each finished download is saved to the target directory.
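As a variation on the same idea, the standard library's thread pool bounds how many downloads run at once instead of starting an unbounded number of threads. The sketch below is only an illustration under assumptions: img_urls and fetch are hypothetical stand-ins for the URL list and the download() logic used above, not part of the assignment code.

from concurrent.futures import ThreadPoolExecutor
import urllib.request

# img_urls is a hypothetical list of image URLs produced by the parsing step
img_urls = ["http://www.weather.com.cn/example1.jpg",
            "http://www.weather.com.cn/example2.png"]

def fetch(url, idx):
    # download one image and save it under a sequential name (simplified from download() above)
    try:
        data = urllib.request.urlopen(url, timeout=100).read()
        with open(str(idx) + ".jpg", "wb") as fobj:
            fobj.write(data)
        print("downloaded", idx)
    except Exception as err:
        print(err)

# A fixed-size pool caps the number of simultaneous downloads,
# which is gentler on the server than one thread per image.
with ThreadPoolExecutor(max_workers=8) as pool:
    for i, u in enumerate(img_urls, start=1):
        pool.submit(fetch, u, i)
# leaving the with-block waits for every submitted download to finish, replacing the manual join() loop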
Assignment ②
Scraping stock information
Scrape stock data from Eastmoney (quote.eastmoney.com).
- items.py
import scrapy

# 3-2
class StocksItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    t1 = scrapy.Field()
    t2 = scrapy.Field()
    t3 = scrapy.Field()
    t4 = scrapy.Field()
    t5 = scrapy.Field()
    t6 = scrapy.Field()
    t7 = scrapy.Field()
    t8 = scrapy.Field()
    t9 = scrapy.Field()
    t10 = scrapy.Field()
    t11 = scrapy.Field()
- 3-2.py
import json
import scrapy
from demo1.items import StocksItem

class Stocks(scrapy.Spider):
    name = 'Stocks'
    allowed_domains = ['quote.eastmoney.com']
    total_page_num = 0
    base_url = r'http://36.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124017913335698798893_1696658430311&pn={}' \
               r'&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web' \
               r'&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,' \
               r'f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696658430312'
    start_urls = [base_url.format(1)]

    def parse(self, response):
        data = response.text
        data = data.split('(', maxsplit=1)[1]  # drop the JSONP callback name and the opening parenthesis
        data = data[:-2]                       # drop the trailing ");"
        datas = json.loads(data)
        data_list = datas['data']['diff']
        for data in data_list:
            dic = StocksItem()
            dic['name'] = str(data['f14'])
            dic['price'] = str(data['f2'])
            dic['t1'] = str(data['f3']) + "%"
            dic['t2'] = str(data['f4'])
            dic['t3'] = str(data['f5'] / 1000.0)
            dic['t4'] = str(data['f6'] / 100000000.0)
            dic['t5'] = str(data['f7']) + "%"
            dic['t6'] = str(data['f15'])
            dic['t7'] = str(data['f16'])
            dic['t8'] = str(data['f17'])
            dic['t9'] = str(data['f18'])
            dic['t10'] = str(data['f10'])
            dic['t11'] = str(data['f12'])
            print(dic)
            yield dic
- pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import pymysql

# Assignment 3-2
class StockPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="MyDB",
                                       charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("create table if not exists stocks "
                                "(name varchar(10),price varchar(10),t1 varchar(10),"
                                "t2 varchar(10),t3 varchar(10),t4 varchar(10),t5 varchar(10),t6 varchar(10),t7 varchar(10)"
                                ",t8 varchar(10),t9 varchar(10),t10 varchar(10),t11 varchar(10),"
                                " primary key (name))")
            self.cursor.execute("delete from stocks")  # clear old rows from the stocks table before inserting
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def process_item(self, item, spider):
        insert = "insert into stocks (name,price,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11) " \
                 "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"  # pymysql uses %s placeholders
        self.cursor.execute(insert, (
            item['name'], item['price'], item['t1'], item['t2'], item['t3'], item['t4'], item['t5'],
            item['t6'], item['t7'], item['t8'], item['t9'], item['t10'], item['t11']))
        return item

    def close_spider(self, spider):
        self.con.commit()
        self.con.close()
        print("closed")
- settings.py
BOT_NAME = "demo1"
SPIDER_MODULES = ["demo1.spiders"]
NEWSPIDER_MODULE = "demo1.spiders"
ROBOTSTXT_OBEY = True
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
ITEM_PIPELINES = {
    'demo1.pipelines.StockPipeline': 300,  # 3-2
}
Run results:

Reflections
This spider is built with the Scrapy framework and fetches stock data from Eastmoney. In the parse method, the spider receives the response and extracts the required stock fields from the response text. Because the API returns JSONP, the callback wrapper is stripped first and the remaining text is turned into a dictionary with json.loads; the required fields are then read from that dictionary into a StocksItem instance, setting its attributes one by one. Finally the item is returned with yield, handing the data over to Scrapy for further processing.
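The callback-stripping step can also be done with a regular expression instead of splitting on the first parenthesis. The sketch below is standalone and illustrative only; the text value is a hypothetical reply merely shaped like the Eastmoney JSONP response, not real market data.

import json
import re

# hypothetical JSONP text in the same "callback( {...} );" shape as the API reply
text = 'jQuery1124017913335698798893_1696658430311({"data": {"diff": [{"f12": "000001", "f14": "Example"}]}});'

# capture everything between the callback's outer parentheses
match = re.search(r'^\s*[\w.]+\((.*)\)\s*;?\s*$', text, re.S)
payload = json.loads(match.group(1))
print(payload['data']['diff'][0]['f14'])  # -> Example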
Assignment ③
Scrape foreign-exchange rate data from the Bank of China website (www.boc.cn).
- items.py
import scrapy

class BankItem(scrapy.Item):
    Currency = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    Time = scrapy.Field()
- 3-3.py
import scrapy
from demo2.items import BankItem

class BankSpider(scrapy.Spider):
    name = "BankSpider"
    allowed_domains = ['www.boc.cn']
    start_urls = ['http://www.boc.cn/sourcedb/whpj/']
    base_url = 'https://www.boc.cn/sourcedb/whpj/index_'
    end_url = '.html'
    page = 0

    def parse(self, response):
        trs = response.xpath('//tr')
        for tr in trs[2:29]:  # the exchange-rate rows of the table
            Currency = tr.xpath('./td[1]/text()').extract_first()
            if tr.xpath('./td[2]/text()').extract_first() is not None:
                TBP = tr.xpath('./td[2]/text()').extract_first()
            else:
                TBP = 'none'
            if tr.xpath('./td[3]/text()').extract_first() is not None:
                CBP = tr.xpath('./td[3]/text()').extract_first()
            else:
                CBP = 'none'
            if tr.xpath('./td[4]/text()').extract_first() is not None:
                TSP = tr.xpath('./td[4]/text()').extract_first()
            else:
                TSP = 'none'
            if tr.xpath('./td[5]/text()').extract_first() is not None:
                CSP = tr.xpath('./td[5]/text()').extract_first()
            else:
                CSP = 'none'
            Time = tr.xpath('./td[7]/text()').extract_first()
            book = BankItem(Currency=Currency, TBP=TBP, CBP=CBP, TSP=TSP, CSP=CSP, Time=Time)
            yield book
        if self.page < 2:  # follow the next two pages (index_1.html, index_2.html)
            self.page += 1
            url = self.base_url + str(self.page) + self.end_url
            yield scrapy.Request(url=url, callback=self.parse)
- pipelines.py
import sqlite3

class BankPipeline:
    def __init__(self):
        # connect to a local SQLite database; create the Bank table on the first run, clear it otherwise
        self.con = sqlite3.connect("MyDB.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table Bank (Currency varchar,TBP varchar,CBP varchar,TSP varchar,CSP varchar,Time varchar)")
        except Exception:
            self.cursor.execute("delete from Bank")

    def process_item(self, item, spider):
        try:
            Currency = item.get('Currency')
            TBP = item.get('TBP')
            CBP = item.get('CBP')
            TSP = item.get('TSP')
            CSP = item.get('CSP')
            Time = item.get('Time')
            # insert one row of exchange-rate data
            self.cursor.execute(
                "insert into Bank (Currency,TBP,CBP,TSP,CSP,Time) values(?,?,?,?,?,?)",
                (str(Currency), str(TBP), str(CBP), str(TSP), str(CSP), str(Time)))
            self.con.commit()
        except Exception as err:
            print(err)
        return item
- settings.py
BOT_NAME = "demo2"
SPIDER_MODULES = ["demo2.spiders"]
NEWSPIDER_MODULE = "demo2.spiders"
ROBOTSTXT_OBEY = True
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
BOT_NAME = "Bank"
SPIDER_MODULES = ["Bank.spiders"]
NEWSPIDER_MODULE = "Bank.spiders"
ITEM_PIPELINES = {
"demo2.pipelines.BankPipeline": 300,
}
Run results:

Reflections
This spider fetches foreign-exchange rate data from the Bank of China exchange-rate page. In the parse method it uses XPath selectors to pull the rate fields out of each table row of the response and stores them in a BankItem instance, which is then returned with yield so that Scrapy can process it. When more pages need to be crawled, the spider builds the next page's URL and issues a new Request with parse as its callback.
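To show what the XPath expressions in parse pick out, here is a small standalone sketch using scrapy.Selector on a made-up table row; the HTML below is a hypothetical simplification, not the real boc.cn markup.

from scrapy import Selector

# hypothetical, simplified exchange-rate table with the same column order assumed by the spider
html = """
<table>
  <tr><th>Currency</th><th>TBP</th><th>CBP</th><th>TSP</th><th>CSP</th><th>-</th><th>Time</th></tr>
  <tr><td>USD</td><td>709.1</td><td>703.3</td><td>712.1</td><td>712.1</td><td>-</td><td>2023-10-07 10:30:00</td></tr>
</table>
"""

sel = Selector(text=html)
for tr in sel.xpath('//tr')[1:]:                            # skip the header row
    Currency = tr.xpath('./td[1]/text()').extract_first()   # first <td>: the currency column
    TBP = tr.xpath('./td[2]/text()').extract_first()         # second <td>: the TBP column
    Time = tr.xpath('./td[7]/text()').extract_first()        # seventh <td>: the time column
    print(Currency, TBP, Time)                               # -> USD 709.1 2023-10-07 10:30:00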