2023 Data Collection and Fusion Technology Practice — Assignment 1

Published 2023-09-21 17:18:45 · Author: 陈子阳

The code is hosted on Gitee: https://gitee.com/yangzizizi/crawl_projec.git

Assignment 1

  • 1) Use the requests and BeautifulSoup libraries to scrape the given URL and print the university ranking information to the screen.

import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    "User-Agent": UserAgent().chrome
}
url = "http://www.shanghairanking.cn/rankings/bcur/2020"

class Top:

    def __init__(self, headers, url):
        self.headers = headers
        self.url = url

    def get_info(self, n):
        """Print the ranking table for the top n universities.

        Args:
            n: number of universities to print
        """
        # Fetch the page
        req = requests.get(url=self.url, headers=self.headers)
        # Use the detected encoding to avoid garbled Chinese text
        req.encoding = req.apparent_encoding
        # Hand the HTML to BeautifulSoup for parsing
        soup = BeautifulSoup(req.text, "html.parser")
        # Locate the university name links
        school_name_tags = soup.find_all("a", attrs={"class": "name-cn"})
        # Collect the first n university names into a list
        school_name_list = []
        for name in school_name_tags:
            if len(school_name_list) == n:
                break
            school_name_list.append(name.get_text(strip=True))

        # Grab the cells holding rank, province/city, school type and score
        info_all = soup.find_all("td", attrs={"data-v-4645600d": "", "class": ""})
        # The cells repeat in groups of five; the first four of each group
        # are rank, province/city, type and score, so split them into four lists
        school_rank_list = []
        school_city_list = []
        school_type_list = []
        school_count_list = []

        for i in range(len(info_all)):
            if len(school_count_list) == n:
                break
            if i % 5 == 0:
                school_rank_list.append(info_all[i].get_text(strip=True))
            elif i % 5 == 1:
                school_city_list.append(info_all[i].get_text(strip=True))
            elif i % 5 == 2:
                school_type_list.append(info_all[i].get_text(strip=True))
            elif i % 5 == 3:
                school_count_list.append(info_all[i].get_text(strip=True))

        # Assemble the five lists into a DataFrame indexed by rank
        df = pd.DataFrame()
        df["排名"] = school_rank_list
        df["学校名称"] = school_name_list
        df["省市"] = school_city_list
        df["学校类型"] = school_type_list
        df["总分"] = school_count_list
        df.set_index("排名", inplace=True)
        print(df)


if __name__ == "__main__":
    top = Top(url=url, headers=headers)
    top.get_info(14)

The output is shown below:

  • 2) Reflections

Through this exercise in scraping a website, I learned how to crawl data from a static page. It was a small warm-up.
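As a recap, a minimal sketch of the static-page pattern used above (requests plus BeautifulSoup, with an explicit status check that the assignment code omits) might look like this. The User-Agent string is a placeholder; any desktop browser UA works:

import requests
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
headers = {"User-Agent": "Mozilla/5.0"}  # placeholder UA string

resp = requests.get(url, headers=headers, timeout=10)
resp.raise_for_status()                  # fail fast on non-2xx responses
resp.encoding = resp.apparent_encoding   # avoid garbled Chinese text

soup = BeautifulSoup(resp.text, "html.parser")
# Print the first few university names as a sanity check
for tag in soup.find_all("a", attrs={"class": "name-cn"})[:5]:
    print(tag.get_text(strip=True))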

Assignment 2

  • 1) Use the requests and re libraries to build a price-comparison crawler for an online store of your choice; search the store with the keyword “书包” (backpack) and scrape the product names and prices.

For this assignment I chose JD.com and scraped the names and prices of the first 60 backpack listings. Although the task statement mentions the re library, the code below parses with BeautifulSoup; a regex-based sketch follows the reflections.

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Request headers that mimic a real browser; the Cookie carries a logged-in JD session
headers = {
    'User-Agent':UserAgent().chrome,
    'Cookie':'shshshfpa=92c82d13-2121-7ccc-52f2-d51ab82c4052-1690361821; shshshfpx=92c82d13-2121-7ccc-52f2-d51ab82c4052-1690361821; __jdu=16903618203351137076469; areaId=16; PCSYCityID=CN_350000_350100_0; pinId=eMEp47mUR5nCqSQvMsdXNw; pin=%E9%98%B3%E5%AD%90%E5%AD%90%E5%AD%90%E5%AD%90; unick=%E9%98%B3%E5%AD%90%E5%AD%90%E5%AD%90%E5%AD%90cZc; _tp=u0kJyzagg7oUGSEhWLwup2V%2FvYQX9JhDwhsvF0jQci3ZneNj%2FXJyvA1ea0q%2FGIa1; _pst=%E9%98%B3%E5%AD%90%E5%AD%90%E5%AD%90%E5%AD%90; qrsc=3; ipLoc-djd=16-1303-1305-48927; TrackID=1MMSOxGpHwA_LU1V7ndv5N4fVWVTyMyMwqEW8jF17lyp2LhRIyCOHa9AuiH4rIojuwcNffv2-EerSk9ps3HgDWOH5ZIKK7kgPECB9YLkgk5UUqhvnY7pY-R7xDK8bAquR; unpl=JF8EALNnNSttX0pTDBwLSEURTl5QWw4KGR8KZjIMXVkKTF1RTwFMRxF7XlVdXxRKHx9sZxRUWFNPUA4aBisSEXteU11bD00VB2xXXAQDGhUQR09SWEBJJVlQXl4ITxcFZ2A1ZF5Ye1QEKwIcGhFJWlRXXglJFQdvZwdSXF5MUQITMhoiF3ttZFpcAUkSAF9mNVVtGh8IBx4LHxoYBl1TVlwKTBcKbGYHVllYS1YDGgQcFxdDbVVuXg; __jdv=76161171|haosou-search|t_262767352_haosousearch|cpc|5512151796_0_606868bf04254b3c988d885c68ee2fd0|1695279308964; mt_xid=V2_52007VwMVWlxaVVoWShheB2MDElBbWVRdGk4RbARuU0JUXlsHRkocH1QZYlNBVkELAV5IVU1aDW9WEVIJDVQNHnkaXQZlHxNaQVtSSx9MElgBbAMSYl1oUmofSxtUDWYKEFRfWGJeG0ob; joyya=1695279310.0.22.089bua8; __jdc=122270672; rkv=1.0; mba_muid=16903618203351137076469; wlfstk_smdl=xmmxhk7ane3j6idhh26fiq6mghtkwdyu; thor=6D9B9BE9BF4FBF426A2ADD7A1CCAF4FA5CB3D52533E8E110E0DECF34E31CDB30C5883CA0B234A6CFE67914A0CEAF1AACF370E053C3D0B4678772C9E854F0804BE4AEA58092751238BB7989CFE007B66A2B33FFD45CD9786EADE5F857DEFE95FEB133397F2C098E6BB8513CB03D011A36B57B948BB7E939E33956BB0673FEF034; flash=2_TDnbeSoNUWYk6aRDO0LYbz7JGkIC3Hs45AeD2yIhMsOdQq9HWlztIz6ObNdRCLgAMhPgQFz3zeOUYmKvhhjvX0nCDrHBbG4VGjMRTl8M7dN*; ceshi3.com=000; jsavif=0; jsavif=0; __jda=122270672.16903618203351137076469.1690361820.1695279304.1695286165.5; 3AB9D23F7A4B3CSS=jdd03JFNB3FD6VTTLETYHCCXR4X3RFJY2CMTRL5KRQYGCC6LXIEDZP4PFPASSWCV2HPHIAZS7R3TLL3RTKBK6YJVA5VSTZYAAAAMKW3W73IAAAAAADZL3FIABYA4LAIX; _gia_d=1; xapieid=jdd03JFNB3FD6VTTLETYHCCXR4X3RFJY2CMTRL5KRQYGCC6LXIEDZP4PFPASSWCV2HPHIAZS7R3TLL3RTKBK6YJVA5VSTZYAAAAMKW3W73IAAAAAADZL3FIABYA4LAIX; __jdb=122270672.2.16903618203351137076469|5.1695286165; shshshfpb=AAtwN7raKEsgtEyEhfMxS8tUauCxAUhaQNhghUwAAAAA; 3AB9D23F7A4B3C9B=JFNB3FD6VTTLETYHCCXR4X3RFJY2CMTRL5KRQYGCC6LXIEDZP4PFPASSWCV2HPHIAZS7R3TLL3RTKBK6YJVA5VSTZY'
}

# Send the search request
url = "https://search.jd.com/Search?keyword=书包&enc=utf-8"
response = requests.get(url, headers=headers)

# Stop early if the request failed
response.raise_for_status()

response.encoding = response.apparent_encoding
# Grab the page HTML
page_content = response.text
# Parse the page with BeautifulSoup
soup = BeautifulSoup(page_content, 'html.parser')

info_list = soup.find_all('li', class_='gl-item')[:60]  # keep at most 60 items

for idx, info in enumerate(info_list):
    # Each product card holds its name in div.p-name and its price in div.p-price
    name_tag = info.find('div', class_='p-name')
    price_tag = info.find('div', class_='p-price')

    if name_tag and price_tag:
        name = name_tag.find('em')
        price = price_tag.find('i')

        if name and price:
            print(f"{idx + 1}. Name: {name.get_text(strip=True)}, Price: {price.get_text(strip=True)}")

The output is shown below:

  • 2) Reflections

In this experiment, extracting the backpack information required drilling through several nested tags (the product card, then the name and price containers), which is easiest to do step by step.
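Since the task statement names the re library, here is a minimal regex-based sketch of the same extraction. It assumes, based on the markup the BeautifulSoup version relies on, that each name sits inside an <em> under div.p-name and each price inside an <i> under div.p-price; the real JD markup may vary, so treat the patterns as illustrative:

import re

# `page_content` is the HTML fetched above; re.S lets '.' cross newlines
name_pattern = re.compile(r'<div class="p-name[^"]*">.*?<em>(.*?)</em>', re.S)
price_pattern = re.compile(r'<div class="p-price[^"]*">.*?<i>(.*?)</i>', re.S)

names = name_pattern.findall(page_content)[:60]
prices = price_pattern.findall(page_content)[:60]

for idx, (raw_name, price) in enumerate(zip(names, prices), start=1):
    # Names may contain nested tags such as <font>; strip them out
    clean_name = re.sub(r'<[^>]+>', '', raw_name).strip()
    print(f"{idx}. Name: {clean_name}, Price: {price}")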

Assignment 3

  • 1) Crawl all JPEG and JPG files from a given webpage (or a page of your choice).

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
headers = {
    'User-Agent': UserAgent().chrome
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'lxml')

# The article images on this page all carry width="600"
imgs = soup.find_all('img', attrs={'width': '600'})
# Turn the relative src attributes into absolute URLs
img_address = []
for img in imgs:
    picture_address = "https://xcb.fzu.edu.cn" + img['src']
    img_address.append(picture_address)

# Download each image and write its binary content to disk
for i in range(len(img_address)):
    res = requests.get(img_address[i], headers=headers)
    with open(rf"D:\study\大三上课程资料\数据采集与融合\pages_1\picture{i+1}.jpg", "wb") as f:
        f.write(res.content)
        print(f"Image {i+1} downloaded")

The output is shown below:

  • 2) Reflections

The goal of this assignment was to learn how to crawl and save images: first collect the image URLs from the page, then GET each URL and write the binary response content to a local file.
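To match the stated goal of grabbing all JPEG/JPG files rather than only the width-600 images, a more general sketch might filter <img> tags by file extension and resolve relative paths with urljoin. The output directory name here is a placeholder:

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

page_url = "https://xcb.fzu.edu.cn/info/1071/4481.htm"
out_dir = "images"  # placeholder output directory
os.makedirs(out_dir, exist_ok=True)

resp = requests.get(page_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
soup = BeautifulSoup(resp.text, "lxml")

count = 0
for img in soup.find_all("img"):
    src = img.get("src", "")
    # Keep only JPEG/JPG files, per the assignment requirement
    if not src.lower().endswith((".jpg", ".jpeg")):
        continue
    full_url = urljoin(page_url, src)  # handles relative paths like /__local/...
    data = requests.get(full_url, timeout=10).content
    count += 1
    with open(os.path.join(out_dir, f"picture{count}.jpg"), "wb") as f:
        f.write(data)
        print(f"Image {count} downloaded")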