python爬虫——对爱奇艺热播榜和必看榜进行可视化和分析 -JZTXT

（一）选题的背景

　　随着社会和科学技术的发展，人们的娱乐生活越发丰富，特别是电影、电视、游戏。电影是一种表演艺术、视觉艺术及听觉艺术，利用胶卷、录像带或数字媒体将影像和声音捕捉起来，再加上后期的编辑工作而成。电影是人类知道其确切产生时间和成长历程的艺术，是20世纪以来发展迅速、影响巨大的媒体，是政治、经济、文化三位一体的创意产业。它能准确地“还原”现实世界，“展现”虚拟世界，给人以逼真感。我将爬取的是爱奇艺电影热播榜和必看榜，了解最近人们都喜欢看哪些电影。

（二）大数据分析设计方案

爬取爱奇艺必看榜和热播榜的名字、排名、实时热度和最高热度，再通过数据可视化、数据模型分析、分布图做数据分析，了解人们最近感兴趣和热度最高的电影。

（三）数据分析步骤

数据集来源：

热播榜：https://www.iqiyi.com/ranks1/1/0

必看榜：https://www.iqiyi.com/ranks1/1/-6

按F12可查看网页源代码，找到名字、排名、热度

排名

名字

热度

爬取网页数据

#导入库

import requests

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import matplotlib

import csv

import scipy as sp

import seaborn as sns

from sklearn.linear_model import LinearRegression

from bs4 import BeautifulSoup

from pandas import DataFrame

from scipy.optimize import leastsq

# Address of the server to request.
url="https://www.iqiyi.com/ranks1/1/0"  # iQiyi hot-play movie ranking page

# Fetch a web page with requests.
def getHTMLText(url, timeout=30):
    """Download *url* and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body, decoded with the encoding requests detects from
        the content, or the sentinel string '产生异常' when the request
        fails or the server answers with an HTTP error status.
    """
    try:
        # Honour the caller's timeout instead of a hard-coded value.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request/HTTP failures map to the sentinel; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return '产生异常'

# Parse the downloaded page with BeautifulSoup's 'html.parser' backend.
html = getHTMLText(url)
soup = BeautifulSoup(html, 'html.parser')
# Print the parse tree with standard indentation.
print(soup.prettify())

# Request the page again, this time directly through requests.
# A timeout is passed so a stalled connection cannot block the script forever.
r = requests.get(url, timeout=30)
# Decode the body with the encoding detected from its content.
r.encoding = r.apparent_encoding
html = r.text
soup = BeautifulSoup(html, 'lxml')
# Pretty-print the bs4 tree so the page structure is easier to read.
print(soup.prettify())

# Collect the movie titles: the stripped text of every <div class="rvi__tit1">.
title1 = [
    tag.get_text().strip()
    for tag in soup.find_all('div', class_='rvi__tit1')
]
# Bare expression; no effect in a plain script (presumably kept for
# notebook-style display of the list).
title1

通过这个可以看出爱奇艺热播电影榜的前25名。

# Collect the heat values: the stripped text of every
# <span class="rvi__index__num">.
b1 = [
    tag.get_text().strip()
    for tag in soup.find_all('span', class_='rvi__index__num')
]

# Combine the scraped titles and heat values into one ranked table.
print('{:^50}'.format('爱奇艺热播榜'))
print('{:^5}\t{:^40}\t{:^10}'.format('排名', '名称', '实时热度'))

# Keep at most the first 25 entries. zip() stops at the shorter list, so a
# page that yields fewer than 25 titles or heat values no longer raises
# IndexError.
lit1 = [
    [rank, name, heat]
    for rank, (name, heat) in enumerate(zip(title1[:25], b1[:25]), start=1)
]
for rank, name, heat in lit1:
    print('{:^5}\t{:^40}\t{:^10}'.format(rank, name, heat))

df1 = pd.DataFrame(lit1, columns=['排名', '名称', '实时热度'])

通过这个可以看出在爱奇艺热播电影榜中前 25名一一对应的实时热度。

必看榜：

# iQiyi must-watch movie ranking page.
url="https://www.iqiyi.com/ranks1/1/-6"

def getHTMLText(url, timeout=30):
    """Download *url* and return its decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body, decoded with the encoding requests detects from
        the content, or the sentinel string '产生异常' when the request
        fails or the server answers with an HTTP error status.
    """
    try:
        # Honour the caller's timeout instead of a hard-coded value.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request/HTTP failures map to the sentinel; programming
        # errors and KeyboardInterrupt are allowed to propagate.
        return '产生异常'

# Download the must-watch page through the helper and print its parse tree.
html = getHTMLText(url)
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())

# Request the page again directly and re-parse it with lxml.
# A timeout is passed so a stalled connection cannot block the script forever.
r = requests.get(url, timeout=30)
# Decode the body with the encoding detected from its content.
r.encoding = r.apparent_encoding
html = r.text
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())

# Collect the movie titles: the stripped text of every <div class="rvi__tit1">.
title2 = [
    tag.get_text().strip()
    for tag in soup.find_all('div', class_='rvi__tit1')
]
# Bare expression; no effect in a plain script (presumably kept for
# notebook-style display of the list).
title2

# Collect the heat values: the stripped text of every
# <span class="rvi__index__num">.
b2 = [
    tag.get_text().strip()
    for tag in soup.find_all('span', class_='rvi__index__num')
]

# Combine the scraped titles and heat values into one ranked table.
print('{:^50}'.format('爱奇艺必看榜'))
print('{:^5}\t{:^40}\t{:^10}'.format('排名', '名称', '最高热度'))

# Keep at most the first 25 entries. zip() stops at the shorter list, so a
# page that yields fewer than 25 titles or heat values no longer raises
# IndexError.
lit2 = [
    [rank, name, heat]
    for rank, (name, heat) in enumerate(zip(title2[:25], b2[:25]), start=1)
]
for rank, name, heat in lit2:
    print('{:^5}\t{:^40}\t{:^10}'.format(rank, name, heat))

df2 = pd.DataFrame(lit2, columns=['排名', '名称', '最高热度'])

通过这个可以看出在爱奇艺必看电影榜中前 25名，其中前10中有6部是讲国家的，向我们传播爱国，让我们了解我们先辈们的艰辛，更加了解历史，勿忘历史。

# Persist the scraped data: one Excel workbook per ranking, written to the
# current working directory without the DataFrame index column.
df1 = pd.DataFrame(lit1, columns=['排名', '名称', '实时热度'])
df1.to_excel('热播榜.xlsx', index=False)

df2 = pd.DataFrame(lit2, columns=['排名', '名称', '最高热度'])
df2.to_excel('必看榜.xlsx', index=False)

# Load the hot-play ranking back from its Excel file.
# read_excel already returns a DataFrame, so no extra wrapping is needed.
rank1 = pd.read_excel('热播榜.xlsx')
print(rank1)

# Load the must-watch ranking back from its Excel file.
rank2 = pd.read_excel('必看榜.xlsx')
print(rank2)

# Data cleaning

# 1. Drop the title column.
rank1 = rank1.drop(columns='名称')
print(rank1)
rank2 = rank2.drop(columns='名称')
print(rank2)

# 2. Check for duplicated rows.
print(rank1.duplicated())
print(rank2.duplicated())

# 3. Check for missing heat values.
print(rank1['实时热度'].isna().value_counts())
print(rank2['最高热度'].isna().value_counts())

# 4. Outlier check: print the summary statistics of each table.
print(rank1.describe())
print(rank2.describe())

# Data analysis: linear regression of heat on rank.
from sklearn.linear_model import LinearRegression

# Use only the rank as the feature. Dropping just the title column would
# leave the heat column in X, making the target one of its own predictors.
X = df1[['排名']]
predict_model = LinearRegression()
predict_model.fit(X, df1['实时热度'])
print("回归系数为:", predict_model.coef_)

# Same regression for the must-watch ranking.
X = df2[['排名']]
predict_model = LinearRegression()
predict_model.fit(X, df2['最高热度'])
print("回归系数为:", predict_model.coef_)

# Draw the regression plots of heat against rank for both rankings.
import seaborn as sns

# Use the SimHei font for the Chinese labels and keep the minus sign rendering.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# x and y are passed by keyword: current seaborn releases accept them only
# as keyword arguments, and older releases accept this form as well.
sns.regplot(x="排名", y="实时热度", data=rank1, label="热播榜")
sns.regplot(x="排名", y="最高热度", data=rank2, label="必看榜")

由此看出热播榜的作品和必看榜的作品热度之间相差甚大，唯有一部作品接近却没有超过的。

# Joint scatter plots with a regression fit (kind='reg'), one per ranking.
for frame, heat_col, colour in (
    (rank1, "实时热度", 'Orchid'),
    (rank2, "最高热度", 'b'),
):
    sns.jointplot(x="排名", y=heat_col, data=frame, kind='reg', color=colour)

图1可以看出有一部作品的热度远超其他作品，图2可以看出热度高的作品还是有很多的。

# 2x2 grid: top row uses distplot defaults, bottom row customised variants.
fig, axes = plt.subplots(nrows=2, ncols=2)

# 1. Hot-play ranking with the default appearance.
sns.distplot(rank1['实时热度'], ax=axes[0, 0])

# 2. Must-watch ranking with the default appearance.
sns.distplot(rank2['最高热度'], ax=axes[0, 1])

# 3. Hot-play ranking with vertical=True and custom histogram / KDE colours.
sns.distplot(
    rank1['实时热度'],
    vertical=True,
    hist_kws=dict(color='black', label='hist'),
    kde_kws=dict(color='Orchid', label='KDE'),
    ax=axes[1, 0],
)

# 4. Must-watch ranking with vertical=True and custom histogram / KDE colours.
sns.distplot(
    rank2['最高热度'],
    vertical=True,
    hist_kws=dict(color='black', label='hist'),
    kde_kws=dict(color='b', label='KDE'),
    ax=axes[1, 1],
)

图1、3可以看出热度在2500的作品较多并且在6000处有空缺，说明没有热播榜作品的热度在6000左右。

图2、4可以看出热度在7500的作品较多。

# Joint plots with hexagonal binning (kind='hex'), one per ranking.
for frame, heat_col, colour in (
    (rank1, "实时热度", 'Orchid'),
    (rank2, "最高热度", 'b'),
):
    sns.jointplot(x="排名", y=heat_col, data=frame, kind='hex', color=colour)

# Take rank and heat as the two variables, plot their distribution, and use
# least squares to find the quadratic fit between them.

# The workbooks are read from the working directory, where they were written.
# header=0 together with names= replaces the sheet's own header row with the
# given column names; adding skiprows=1 on top of that would also discard
# the first data row.
colnames1 = ["排名", "名称", "实时热度"]
df1 = pd.read_excel('热播榜.xlsx', header=0, names=colnames1)

colnames2 = ["排名", "名称", "最高热度"]
df2 = pd.read_excel('必看榜.xlsx', header=0, names=colnames2)

# Hot-play ranking: rank, heat and title columns.
X1 = df1.排名
Y1 = df1.实时热度
Z1 = df1.名称

# Must-watch ranking: rank, heat and title columns.
X2 = df2.排名
Y2 = df2.最高热度
Z2 = df2.名称

# Line chart of heat against rank for both rankings.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # font for the Chinese labels

for xs, ys, colour, name in (
    (X1, Y1, "g", "热播榜"),
    (X2, Y2, "r", "必看榜"),
):
    plt.plot(xs, ys, color=colour, linewidth=2, label=name)

plt.xlabel("排名")
plt.ylabel("热度")
plt.title('热度与排名关系图')
plt.legend(loc=1)
plt.show()

可以看出必看榜热度分布比较集中，热播榜起伏较大。

# Bar chart of heat against rank for both rankings.
# The must-watch bars (X2, Y2) are drawn first and the hot-play bars
# (X1, Y1) on top of them. Each series is labelled with the ranking it
# actually holds and uses the same colour as in the scatter plots
# (Orchid = hot-play, blue = must-watch).
plt.bar(X2, Y2, color="b", linewidth=2, label="必看榜")
plt.bar(X1, Y1, color="Orchid", linewidth=2, label="热播榜")
plt.legend(loc=1)
plt.xlabel("排名")
plt.ylabel("热度")
plt.title('热度与排名柱状关系图')
plt.show()

# Draw the scatter plot.
def A():
    """Scatter the heat values of both rankings against rank on one figure.

    Reads the module-level series X1/Y1 (hot-play) and X2/Y2 (must-watch),
    then shows the figure. Returns None.
    """
    series = (
        (X1, Y1, "Orchid", '热播榜'),
        (X2, Y2, "b", '必看榜'),
    )
    for xs, ys, colour, name in series:
        plt.scatter(xs, ys, color=colour, linewidth=2, label=name)
    plt.legend(loc=1)
    plt.title("散点图", color="blue")
    plt.grid()
    plt.xlabel("排名")
    plt.ylabel("热度")
    plt.show()

def func(p, x):
    """Evaluate the quadratic a*x*x + b*x + c, where p holds (a, b, c)."""
    quad, lin, const = p
    return quad*x*x + lin*x + const

def error(p, x, y):
    """Return the residual between func(p, x) and the observed value y."""
    predicted = func(p, x)
    return predicted - y

# Run A() to show the scatter plot. A() has no return statement, so this
# also prints "None".
print(A())

可以看出必看榜热度分布比较集中，热播榜起伏较大。

def main1():
    """Fit and plot a quadratic curve of heat against rank (hot-play ranking).

    Runs scipy's leastsq on the module-level residual function ``error``
    with the module-level data X1 (rank) and Y1 (heat), prints the fitted
    coefficients a, b, c of a*x*x + b*x + c, and shows the data points
    together with the fitted curve. Returns None.
    """
    plt.figure(figsize=(10, 6))
    p0 = [0, 0, 0]  # initial guess for (a, b, c)
    Para = leastsq(error, p0, args=(X1, Y1))
    a, b, c = Para[0]  # first element of the result holds the solution
    print("a=", a, "b=", b, "c=", c)

    plt.scatter(X1, Y1, color="Orchid",
                linewidth=2, label='热播榜热度值分布')

    # Sample the curve over the ranks actually present in the data, so the
    # fitted line spans every plotted point instead of stopping at x=20.
    x1 = np.linspace(np.min(X1), np.max(X1), 100)
    y1 = a*x1*x1 + b*x1 + c
    plt.plot(x1, y1, color="red",
             linewidth=2, label="热播榜拟合曲线")

    # One legend call after both artists exist.
    plt.legend(loc=1)
    plt.title("热播榜一元二次方程关系图")
    plt.xlabel("排名")
    plt.ylabel("实时热度")
    plt.grid()
    plt.show()

# Run main1() to print the fitted coefficients and show the plot. main1()
# has no return statement, so this also prints "None".
print(main1())

由此图看出，热播榜的热度主要在一元二次方程的右上方。

def main2():
    """Fit and plot a quadratic curve of heat against rank (must-watch ranking).

    Runs scipy's leastsq on the module-level residual function ``error``
    with the module-level data X2 (rank) and Y2 (heat), prints the fitted
    coefficients a, b, c of a*x*x + b*x + c, and shows the data points
    together with the fitted curve. Returns None.
    """
    plt.figure(figsize=(10, 6))
    p0 = [0, 0, 0]  # initial guess for (a, b, c)
    Para = leastsq(error, p0, args=(X2, Y2))
    a, b, c = Para[0]  # first element of the result holds the solution
    print("a=", a, "b=", b, "c=", c)

    plt.scatter(X2, Y2, color="b",
                linewidth=2, label='必看榜热度值分布')

    # Sample the curve over the ranks actually present in the data, so the
    # fitted line spans every plotted point instead of stopping at x=20.
    x2 = np.linspace(np.min(X2), np.max(X2), 100)
    y2 = a*x2*x2 + b*x2 + c
    plt.plot(x2, y2, color="red",
             linewidth=2, label="必看榜拟合曲线")

    # One legend call after both artists exist.
    plt.legend(loc=1)
    plt.title("必看榜一元二次方程关系图")
    plt.xlabel("排名")
    plt.ylabel("最高热度")
    plt.grid()
    plt.show()

# Run main2() to print the fitted coefficients and show the plot. main2()
# has no return statement, so this also prints "None".
print(main2())

由此图看出，点的分布由稀疏到密集，说明排名越高作品的热度越高而且热度增长的幅度越大。

（四）全部代码

（五）总结

　　1.通过对热播榜和必看榜的数据可视化和数据分析，可以看出必看榜前25的热度都在7000以上，并且前10中有70%在8000以上，而且有50%都是关于国家、关于战争的，足以看出人们对和平的渴望和对战争的厌恶。这些电影让我们更加了解历史，勿忘历史，缅怀先烈，珍惜和平！热播榜的热度大多在3000—4000，由此看出热播榜的热度明显低于必看榜。在可视化中可以看出热播榜的实时热度集中在3500，必看榜的最高热度主要集中在7500。

　　2.通过此次python爬虫的数据可视化和数据分析，我巩固了以前不是很了解的函数，也学会了如何去下载模块（pip install +模块），也让我学会了如何去分析网站，对python有了更深层次的了解，提高了我的执行能力和自主学习的能力。