作业一
-
1)用requests和BeautifulSoup库方法定向爬取给定网址的数据,屏幕打印爬取的大学排名信息。
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re
import pandas as pd
# Browser-like request headers; fake_useragent supplies a random Chrome UA
# so the ranking site does not reject the request as a bot.
headers = {
"User-Agent":UserAgent().chrome
}
# 2020 Best Chinese Universities Ranking page to scrape.
url = "http://www.shanghairanking.cn/rankings/bcur/2020"
class Top:
    """Scrape the Best Chinese Universities Ranking page and print the top-n table."""

    def __init__(self, headers, url):
        # Request headers (User-Agent) and the ranking-page URL to fetch.
        self.headers = headers
        self.url = url

    def get_info(self, n):
        """Fetch the ranking page and print the top *n* schools as a DataFrame.

        Args:
            n: number of schools to include in the printed table.
        """
        # Fetch the page.
        req = requests.get(url=self.url, headers=self.headers)
        # Trust the detected encoding to avoid mojibake in Chinese text.
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, "html.parser")

        # School names live in <a class="name-cn"> tags; keep the first n.
        name_tags = soup.find_all("a", attrs={"class": "name-cn"})
        school_names = [tag.get_text(strip=True) for tag in name_tags[:n]]

        # Rank / province / school type / score are consecutive <td> cells,
        # five cells per school row (the fifth cell is unused here).
        cells = soup.find_all("td", attrs={"data-v-4645600d": "", "class": ""})
        texts = [cell.get_text(strip=True) for cell in cells]
        # Split the flat cell list into per-school rows of the 4 fields we need,
        # dropping any trailing incomplete row, and keep the first n.
        rows = [texts[i:i + 4] for i in range(0, len(texts), 5)]
        rows = [row for row in rows if len(row) == 4][:n]

        # `school_type` avoids shadowing the builtin `type` (original bug smell).
        ranks = [row[0] for row in rows]
        cities = [row[1] for row in rows]
        school_types = [row[2] for row in rows]
        scores = [row[3] for row in rows]

        df = pd.DataFrame({
            "排名": ranks,
            "学校名称": school_names,
            "省市": cities,
            "学校类型": school_types,
            "总分": scores,
        })
        df.set_index("排名", inplace=True)
        print(df)
if __name__ == "__main__":
    # Scrape and print the top-14 ranking table when run as a script.
    Top(headers=headers, url=url).get_info(14)
结果显示如下:

-
2)心得体会
对于这次爬取网站信息,可以掌握如何爬取静态页面的数据。也是一个小练习。
作业二
-
1)用requests和re库方法设计某个商城(自己选择)商品比价定向爬虫,爬取该商城,以关键词“书包”搜索页面的数据,爬取商品名称和价格。
对于本次作业,本文选择京东爬取书包的前60个商品名称和价格
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# Request headers: a browser User-Agent plus a session cookie copied from a
# logged-in browser — JD's search page returns little useful HTML without it.
# NOTE(review): cookies expire; this value must be refreshed periodically.
headers = {
    'User-Agent': UserAgent().chrome,
    'Cookie': 'shshshfpa=92c82d13-2121-7ccc-52f2-d51ab82c4052-1690361821; shshshfpx=92c82d13-2121-7ccc-52f2-d51ab82c4052-1690361821; __jdu=16903618203351137076469; areaId=16; PCSYCityID=CN_350000_350100_0; pinId=eMEp47mUR5nCqSQvMsdXNw; pin=%E9%98%B3%E5%AD%90%E5%AD%90%E5%AD%90%E5%AD%90; unick=%E9%98%B3%E5%AD%90%E5%AD%90%E5%AD%90%E5%AD%90cZc; _tp=u0kJyzagg7oUGSEhWLwup2V%2FvYQX9JhDwhsvF0jQci3ZneNj%2FXJyvA1ea0q%2FGIa1; _pst=%E9%98%B3%E5%AD%90%E5%AD%90%E5%AD%90%E5%AD%90; qrsc=3; ipLoc-djd=16-1303-1305-48927; TrackID=1MMSOxGpHwA_LU1V7ndv5N4fVWVTyMyMwqEW8jF17lyp2LhRIyCOHa9AuiH4rIojuwcNffv2-EerSk9ps3HgDWOH5ZIKK7kgPECB9YLkgk5UUqhvnY7pY-R7xDK8bAquR; unpl=JF8EALNnNSttX0pTDBwLSEURTl5QWw4KGR8KZjIMXVkKTF1RTwFMRxF7XlVdXxRKHx9sZxRUWFNPUA4aBisSEXteU11bD00VB2xXXAQDGhUQR09SWEBJJVlQXl4ITxcFZ2A1ZF5Ye1QEKwIcGhFJWlRXXglJFQdvZwdSXF5MUQITMhoiF3ttZFpcAUkSAF9mNVVtGh8IBx4LHxoYBl1TVlwKTBcKbGYHVllYS1YDGgQcFxdDbVVuXg; __jdv=76161171|haosou-search|t_262767352_haosousearch|cpc|5512151796_0_606868bf04254b3c988d885c68ee2fd0|1695279308964; mt_xid=V2_52007VwMVWlxaVVoWShheB2MDElBbWVRdGk4RbARuU0JUXlsHRkocH1QZYlNBVkELAV5IVU1aDW9WEVIJDVQNHnkaXQZlHxNaQVtSSx9MElgBbAMSYl1oUmofSxtUDWYKEFRfWGJeG0ob; joyya=1695279310.0.22.089bua8; __jdc=122270672; rkv=1.0; mba_muid=16903618203351137076469; wlfstk_smdl=xmmxhk7ane3j6idhh26fiq6mghtkwdyu; thor=6D9B9BE9BF4FBF426A2ADD7A1CCAF4FA5CB3D52533E8E110E0DECF34E31CDB30C5883CA0B234A6CFE67914A0CEAF1AACF370E053C3D0B4678772C9E854F0804BE4AEA58092751238BB7989CFE007B66A2B33FFD45CD9786EADE5F857DEFE95FEB133397F2C098E6BB8513CB03D011A36B57B948BB7E939E33956BB0673FEF034; flash=2_TDnbeSoNUWYk6aRDO0LYbz7JGkIC3Hs45AeD2yIhMsOdQq9HWlztIz6ObNdRCLgAMhPgQFz3zeOUYmKvhhjvX0nCDrHBbG4VGjMRTl8M7dN*; ceshi3.com=000; jsavif=0; jsavif=0; __jda=122270672.16903618203351137076469.1690361820.1695279304.1695286165.5; 3AB9D23F7A4B3CSS=jdd03JFNB3FD6VTTLETYHCCXR4X3RFJY2CMTRL5KRQYGCC6LXIEDZP4PFPASSWCV2HPHIAZS7R3TLL3RTKBK6YJVA5VSTZYAAAAMKW3W73IAAAAAADZL3FIABYA4LAIX; _gia_d=1; '
              'xapieid=jdd03JFNB3FD6VTTLETYHCCXR4X3RFJY2CMTRL5KRQYGCC6LXIEDZP4PFPASSWCV2HPHIAZS7R3TLL3RTKBK6YJVA5VSTZYAAAAMKW3W73IAAAAAADZL3FIABYA4LAIX; __jdb=122270672.2.16903618203351137076469|5.1695286165; shshshfpb=AAtwN7raKEsgtEyEhfMxS8tUauCxAUhaQNhghUwAAAAA; 3AB9D23F7A4B3C9B=JFNB3FD6VTTLETYHCCXR4X3RFJY2CMTRL5KRQYGCC6LXIEDZP4PFPASSWCV2HPHIAZS7R3TLL3RTKBK6YJVA5VSTZY'
}
# Search JD for “书包” (backpack).
url = "https://search.jd.com/Search?keyword=书包&enc=utf-8"
response = requests.get(url, headers=headers)
# FIX: the original comment claimed the request was checked for success but
# no check existed — fail fast on HTTP errors instead of parsing an error page.
response.raise_for_status()
# Trust the detected encoding to avoid mojibake in Chinese text.
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'html.parser')
# Each product is an <li class="gl-item">; keep at most the first 60.
items = soup.find_all('li', class_='gl-item')[:60]
for idx, item in enumerate(items, start=1):
    name_div = item.find('div', class_='p-name')
    price_div = item.find('div', class_='p-price')
    # Skip entries missing either field (ads / placeholder rows).
    if not (name_div and price_div):
        continue
    name_em = name_div.find('em')
    price_i = price_div.find('i')
    if name_em and price_i:
        print(f"{idx}. 名字是: {name_em.get_text()}, 价格是:{price_i.get_text()}")
输出显示如下:

-
2)心得体会
对于本次实验,爬取书包的信息,有多个标签需要爬取,可以逐层定位、逐步提取所需内容。
作业三
-
1)爬取一个给定网页或者自选网页的所有JPEG和JPG格式文件
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
headers = {
    'User-Agent': UserAgent().chrome
}
# BUG FIX: the second positional argument of requests.get() is `params`,
# so the original `requests.get(url, headers)` sent the headers dict as
# query parameters and never sent the User-Agent at all.
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
# The article images on this page are the <img> tags with width="600".
imgs = soup.find_all('img', attrs={'width': '600'})
# The src attributes are site-relative; prepend the host to get full URLs.
img_urls = ["https://xcb.fzu.edu.cn" + img['src'] for img in imgs]
for i, img_url in enumerate(img_urls, start=1):
    res = requests.get(img_url, headers=headers)
    # Raw f-string: the original plain literal relied on Python silently
    # keeping invalid escapes like "\s" (a DeprecationWarning).
    # NOTE(review): destination directory is hard-coded and must exist.
    with open(rf"D:\study\大三上课程资料\数据采集与融合\pages_1\picture{i}.jpg", "wb") as f:
        f.write(res.content)
    # FIX: measure word typo 副 → 幅 in the progress message.
    print(f"第{i}幅图片下载完毕")
输出结果如下:


-
2)心得体会
对于本次作业的学习目标是学会爬取图片,如何存储图片。首先将存储图片的网址记录,随后访问图片地址,get获取图片信息,并下载到本地。