爬虫作业

发布时间: 2023-12-10 15:01:47  作者: ynswdfq

1、请用requests库的get()函数访问d: 360搜索主页(尾号7,8学号做)

python代码

import requests
url = "http://hao.360.com/"


def gethtml(url, timeout=10):
    """Fetch *url*, print the decoded body and return it.

    Args:
        url: Address to request with ``requests.get``.
        timeout: Seconds to wait on the network before giving up, so a
            stalled server cannot hang the script.

    Returns:
        The response body decoded as UTF-8, or the string ``"Error!"`` when
        the request fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Covers connection problems, timeouts and HTTP error statuses.
        return "Error!"
    r.encoding = "utf-8"
    print("text内容:", r.text)
    return r.text


for _ in range(20):
    result = gethtml(url)
    # A successful fetch has already printed its body inside gethtml();
    # only the failure marker still needs to be shown here.
    if result == "Error!":
        print(result)

运行结果

 

2、这是一个简单的html页面,请保存为字符串,完成后面的计算要求。d. 获取并打印html页面中的中文字符

python代码

from bs4 import BeautifulSoup
import re
# The sample page is kept as a string literal so it can be parsed offline.
# NOTE: no parser argument is passed, so bs4 chooses one on its own.
soup = BeautifulSoup('''<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
<h1>我的第一个标题</h1>
<p id="first">我的第一个段落。</p>
</body>
<table border="1">
<tr>
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr>
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
</html>''')
print("head标签:\n", soup.head, "\n学号后两位:21")
print("body标签:\n", soup.body)
print("id为first的标签对象:\n", soup.find_all(id="first"))
st = soup.text
# Match CJK Unified Ideographs only, greedily, so every run of Chinese text
# comes back as one string rather than one list entry per character and
# punctuation or other scripts are not picked up.
pp = re.findall(r'[\u4e00-\u9fff]+', st)
print("d:html页面中的中文字符")
print(pp)

运行结果

 

3、 爬中国大学排名网站内容(学号尾号7,8,爬取年份2018)

python代码

import requests 
import urllib.request  
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url, timeout=30):
    """Download *url* and return its body as text.

    ``requests`` is tried first. If that attempt fails for any reason the
    error is printed and the download is retried with ``urllib.request``.

    Args:
        url: Address of the page to download.
        timeout: Seconds each attempt may wait on the network.

    Returns:
        The decoded page text, or ``None`` when both attempts fail.
    """
    try:
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        # Use the charset detected from the body instead of the header value.
        res.encoding = res.apparent_encoding
        return res.text
    except Exception as err:
        # Best effort: report the problem and fall through to urllib.
        print(err)
    try:
        req = urllib.request.Request(url)
        # read must be *called*; the context manager closes the response.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode()
    except Exception as err:
        print(err)
    return None
def fillUnivList(ulist, html):
    """Parse the ranking table in *html* and append its rows to *ulist*.

    Each appended entry is a list of five stripped strings: the text of the
    first cell, the text of the first link in the row, and the text of the
    third, fourth and fifth cells.

    Args:
        ulist: List that receives the parsed rows; it is mutated in place.
        html: Page source. ``None`` or an empty string adds nothing.

    Returns:
        None. Rows without a link or with fewer than five cells are skipped.
    """
    if not html:
        return
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # Page layout changed or the download was an error page.
        return
    for tr in tbody.children:
        # Direct children also include whitespace text nodes; keep tags only.
        if not isinstance(tr, bs4.element.Tag):
            continue
        a = tr('a')
        tds = tr('td')
        if not a or len(tds) < 5:
            continue
        ulist.append([
            tds[0].text.strip(),
            # get_text() also works when the link wraps nested tags, where
            # .string would be None.
            a[0].get_text().strip(),
            tds[2].text.strip(),
            tds[3].text.strip(),
            tds[4].text.strip(),
        ])
def printUnivList(ulist1, num):
    """Print a header line followed by at most *num* rows of *ulist1*.

    Args:
        ulist1: Sequence of rows, each holding at least five values.
        num: Maximum number of rows to print. If the list is shorter, only
            the available rows are printed.
    """
    tplt = "{0:^10}\t{1:^10}\t{2:^12}\t{3:^12}\t{4:^10}"
    print(tplt.format("排名", "学校名称", "省份", "学校类型", "总分"))
    # Slicing stops at the end of the list, so a short scrape cannot raise
    # IndexError the way positional indexing over range(num) would.
    for u in ulist1[:num]:
        print(tplt.format(u[0], u[1], u[2], u[3], u[4]))
def main():
    """Download the 2018 ranking page, parse it and print the first 30 rows."""
    ranking_url = "https://www.shanghairanking.cn/rankings/bcur/2018"
    rows = []
    page = getHTMLText(ranking_url)
    fillUnivList(rows, page)
    printUnivList(rows, 30)


if __name__ == '__main__':
    main()

运行结果

存为csv文件

 
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup
 
url = "http://www.shanghairanking.cn/rankings/bcur/2018"
if __name__ == "__main__":
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as response:
        c = response.read().decode("utf-8")
    soup = BeautifulSoup(c, 'lxml')
    # The first entry becomes the header line of the CSV file.
    sumL = [["排名", "学校名称", "省市", "学校类型", "总分"]]
    # Group cells by their own <tr> rather than counting a flat stream of
    # <td> elements: the result then does not depend on every row having
    # exactly six cells, and the last row is never dropped.
    for tr in soup.select("tr"):
        cells = tr.find_all("td", recursive=False)
        if len(cells) < 5:
            # Rows without five <td> cells cannot fill the five CSV columns.
            continue
        name_cell = cells[1]
        # Prefer the link text for the school name; fall back to the whole
        # cell when the cell has no link.
        name_tag = name_cell.a if name_cell.a is not None else name_cell
        sumL.append([
            cells[0].text.strip(),
            name_tag.text.strip(),
            cells[2].text.strip(),
            cells[3].text.strip(),
            cells[4].text.strip(),
        ])
    # to_csv writes the file and returns None when given a path, so nothing
    # is kept from the call.
    pd.DataFrame(sumL).to_csv("./college.csv", index=False, header=False)