1、请用requests库的get()函数访问 d. 360搜索主页(学号尾号为7、8的同学做)
python代码
import requests

url = "http://hao.360.com/"


def gethtml(url):
    """Fetch *url* with ``requests.get`` and return the page text.

    The response body is decoded as UTF-8.

    Returns:
        The decoded page text, or the string ``"Error!"`` when the request
        fails or the server answers with an HTTP error status.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole loop.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = "utf-8"
        # Return the text instead of printing it here, so the caller does not
        # end up printing ``None`` after every successful request.
        return r.text
    except requests.RequestException:
        return "Error!"


# Request the page 20 times and print the result of each attempt.
for i in range(20):
    print("text内容:", gethtml(url))
运行结果

2、这是一个简单的html页面,请保存为字符串,完成后面的计算要求。d. 获取并打印html页面中的中文字符
python代码
from bs4 import BeautifulSoup
import re

# The page under test, kept as a single string. The pieces are joined by
# implicit literal concatenation so the text is unchanged character for
# character.
html_doc = (
    '<!DOCTYPE html> <html> <head> <meta charset="utf-8"> '
    '<title>菜鸟教程(runoob.com)</title> </head> '
    '<body> <h1>我的第一个标题</h1> '
    '<p id="first">我的第一个段落。</p> </body> '
    '<table border="1"> '
    '<tr> <td>row 1, cell 1</td> <td>row 1, cell 2</td> </tr> '
    '<tr> <td>row 2, cell 1</td> <td>row 2, cell 2</td> </tr> '
    '</table> </html>'
)

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning, so results can differ between
# machines.
soup = BeautifulSoup(html_doc, "html.parser")

print("head标签:\n", soup.head, "\n学号后两位:21")
print("body标签:\n", soup.body)
print("id为first的标签对象:\n", soup.find_all(id="first"))

# Extract runs of Chinese characters from the page text.
# The previous pattern ``[\u1100-\uFFFDh]+?`` also matched the Latin letter
# "h", covered non-Chinese scripts and punctuation, and (being lazy) split
# every match into single characters. The CJK Unified Ideographs range with a
# greedy quantifier returns whole Chinese phrases instead.
st = soup.text
pp = re.findall(r'[\u4e00-\u9fff]+', st)
print("d:html页面中的中文字符")
print(pp)
运行结果
3、爬取中国大学排名网站内容(学号尾号为7、8,爬取年份2018)
python代码
import requests
import urllib.request
from bs4 import BeautifulSoup
import bs4


def getHTMLText(url):
    """Download *url* and return its HTML as text.

    ``requests`` is tried first; if that fails for any reason the error is
    printed and ``urllib`` is used as a fallback.

    Returns:
        The decoded page text, or ``None`` when both attempts fail.
    """
    try:
        res = requests.get(url, timeout=10)
        res.raise_for_status()
        res.encoding = res.apparent_encoding
        return res.text
    except Exception as err:
        # Best-effort: report the problem and fall through to urllib.
        print(err)

    try:
        req = urllib.request.Request(url)
        # ``read`` must be *called*; the old ``.read.decode()`` raised
        # AttributeError every time, so the fallback could never succeed.
        with urllib.request.urlopen(req, timeout=10) as resp:
            return resp.read().decode()
    except Exception as err:
        print(err)
    return None


def fillUnivList(ulist, html):
    """Parse the ranking table in *html* and append one row per university.

    Each appended row is ``[rank, name, province, type, score]``, taken from
    the first ``<a>`` and the ``<td>`` cells 0, 2, 3 and 4 of every ``<tr>``
    inside the first ``<tbody>``.

    Args:
        ulist: List that receives the parsed rows (mutated in place).
        html: Page source; ``None`` or an empty string leaves *ulist* as is.
    """
    if not html:
        # The download failed; there is nothing to parse.
        return
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find('tbody')
    if tbody is None:
        # No ranking table in the page.
        return
    for tr in tbody.children:
        # ``children`` also yields whitespace text nodes; keep only tags.
        if not isinstance(tr, bs4.element.Tag):
            continue
        a = tr('a')
        tds = tr('td')
        if not a or len(tds) < 5:
            # Not a complete data row; skip it instead of raising IndexError.
            continue
        ulist.append([
            tds[0].text.strip(),
            # ``.text`` instead of ``.string``: ``.string`` is None when the
            # link contains nested tags, which would crash ``.strip()``.
            a[0].text.strip(),
            tds[2].text.strip(),
            tds[3].text.strip(),
            tds[4].text.strip(),
        ])


def printUnivList(ulist1, num):
    """Print a header line followed by at most *num* rows of *ulist1*."""
    tplt = "{0:^10}\t{1:^10}\t{2:^12}\t{3:^12}\t{4:^10}"
    print(tplt.format("排名", "学校名称", "省份", "学校类型", "总分"))
    # Slicing tolerates a list shorter than ``num`` (index access did not).
    for u in ulist1[:num]:
        print(tplt.format(u[0], u[1], u[2], u[3], u[4]))


def main():
    """Fetch the 2018 ranking page and print its first 30 universities."""
    uinfo = []
    url = "https://www.shanghairanking.cn/rankings/bcur/2018"
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 30)


if __name__ == '__main__':
    main()
运行结果
存为csv文件
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2018"

if __name__ == "__main__":
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(url) as resp:
        c = resp.read().decode("utf-8")
    soup = BeautifulSoup(c, 'lxml')

    # The first row is the CSV header; data rows are appended below.
    sumL = [["排名", "学校名称", "省市", "学校类型", "总分"]]

    # Walk the table row by row and keep the first five cells of each row.
    # The previous flat scan over every <td> with a running counter only
    # worked when each row had exactly six cells: any other count shifted all
    # following rows, and a final row of exactly five cells was never written.
    for row in soup.select("tr"):
        cells = row.select("td")
        if len(cells) < 5:
            # Header rows (<th>) and incomplete rows carry no data.
            continue
        name_cell = cells[1]
        # The school name is read from the link inside the second cell; fall
        # back to the cell text if the link is missing.
        name = name_cell.a.text if name_cell.a is not None else name_cell.text
        sumL.append([
            cells[0].text.strip(),
            name.strip(),
            cells[2].text.strip(),
            cells[3].text.strip(),
            cells[4].text.strip(),
        ])

    # ``to_csv`` returns None when given a path, so the result is not kept.
    pd.DataFrame(sumL).to_csv("./college.csv", index=False, header=False)