2018年全国电影票房分析图
2018年即将过去,在这一年中,我们遇到了很多牛逼的电影,它们的数据怎么样呢?下面就来看一下数据吧。
影院数据为每日票房排名前10的影院,院线数据为每日票房排名前10的院线,城市数据为每日票房排名前10的城市,影片数据为每日票房排名前10的影片。故数据有很多缺失(爬取的目标网站就这么多数据,哈哈)
1,数据的抓取
使用Python+requests抓取数据,保存数据为csv,简单方便,代码如下
# encoding=utf-8
import requests
import time
import json
from datetime import datetime, timedelta
def download(date):
    """
    Fetch one day of box-office statistics and append them to local CSV files.

    The JSON payload's "data" object is split into five files, all opened in
    append mode and written without a header row:

    * dayBoxOffice.csv      - businessDay, cinemaCount, totalAudience,
                              totalBoxoffice, totalSession
    * top10CinemaChains.csv - date, cinemaChainName, dayAudience, daySession,
                              rank, totalSales
    * top10Cinemas.csv      - date, cinemaName, dayAudience, daySession,
                              rank, totalSales
    * top10Citys.csv        - date, cityName, dayAudience, daySession,
                              rank, totalSales
    * top10Films.csv        - date, filmName, dayAudience, daySession,
                              rank, daySales, filmTotalSales

    Values are joined with plain commas and are not quoted or escaped.

    :param date: day to query; it is substituted into the request URL and
                 written as the first column of every top-10 row (the caller
                 passes a "YYYY-MM-DD" string)
    :return: None; a response whose status code is not 200 is skipped silently
    """
    url = "https://zgdypw.cn/pors/w/webStatisticsDatas/api/{}/searchDayBoxOffice".format(date)
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "zgdypw.cn",
        "Pragma": "no-cache",
        "Referer": "https://zgdypw.cn/",
        "User-Agent": "Mozilla/5.0(Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    # Placeholder proxy addresses; replace them with real ones before running.
    proxies = {
        "http": "****",
        "https": "****",
    }
    # requests has no default timeout, so without one a single stalled
    # connection would block the whole day-by-day crawl indefinitely.
    response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
    response.encoding = "utf-8"
    if response.status_code != 200:
        return

    data = json.loads(response.text)["data"]

    # Nationwide totals for the day: exactly one row per call.
    day_box_office = data["dayBoxOffice"]
    total_fields = (
        "businessDay",
        "cinemaCount",
        "totalAudience",
        "totalBoxoffice",
        "totalSession",
    )
    with open("dayBoxOffice.csv", "a+", encoding="utf-8") as file:
        file.write(",".join(str(day_box_office[field]) for field in total_fields) + "\n")

    # Each top-10 list shares the same row layout: the requested date, the
    # entity name, then the listed value fields. The payload key doubles as
    # the output file's base name.
    top10_tables = (
        ("top10CinemaChains", "cinemaChainName", ("dayAudience", "daySession", "rank", "totalSales")),
        ("top10Cinemas", "cinemaName", ("dayAudience", "daySession", "rank", "totalSales")),
        ("top10Citys", "cityName", ("dayAudience", "daySession", "rank", "totalSales")),
        # The films row has one more column than the others. The previous
        # six-placeholder format string silently dropped filmTotalSales;
        # rows appended before this fix therefore have only six columns.
        ("top10Films", "filmName", ("dayAudience", "daySession", "rank", "daySales", "filmTotalSales")),
    )
    for key, name_field, value_fields in top10_tables:
        with open(key + ".csv", "a+", encoding="utf-8") as file:
            for item in data[key]:
                row = [date, item[name_field]] + [item[field] for field in value_fields]
                file.write(",".join(str(value) for value in row) + "\n")
if __name__ == "__main__":
    # Crawl every day from 2018-01-31 up to the current moment, one request
    # per day, echoing each date before it is fetched.
    start = datetime(year=2018, month=1, day=31)
    end = datetime.now()
    # `start` is at midnight, so the whole-day difference plus one counts
    # exactly the days whose midnight is not later than `end`.
    for offset in range((end - start).days + 1):
        day_text = (start + timedelta(days=offset)).strftime("%Y-%m-%d")
        print(day_text)
        download(day_text)
2,数据展示
1)全国票房数据
每日的票房曲线
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime,timedelta
import time
# Load the nationwide daily totals and index them by date.
# NOTE(review): this reads columns by name ("date", "totalBoxoffice", ...),
# so dayBoxOffice.csv must start with a header row providing them - confirm
# the file has one before running.
df = pd.read_csv("dayBoxOffice.csv")
df["date"] = pd.to_datetime(df["date"])
df.set_index("date", inplace=True)

# Daily box-office curve
plt.figure(figsize=(20, 8), dpi=80)
data = df["totalBoxoffice"]
_x = [row.strftime("%Y-%m-%d") for row in data.index]
_y = data.values
# Plot against integer positions; the dates are attached as tick labels below.
plt.plot(range(len(_x)), _y)
interval = 7
# Label every `interval`-th point with its own date. Positions and labels are
# sliced with the same stride so their counts always match; the earlier
# version padded both sides differently, which mismatched whenever the number
# of days was not a multiple of `interval`.
plt.xticks(range(0, len(_x), interval), _x[::interval], rotation=45)
plt.xlabel("日期")
plt.ylabel("票房(单位:万)")
plt.title("2018年电影每日票房图")
plt.grid()
plt.show()
显示的图片
2)每日的观影人次曲线图
# Daily admissions curve
# Relies on `df`, the date-indexed frame loaded from dayBoxOffice.csv earlier
# in this script.
plt.figure(figsize=(20, 8), dpi=80)
data = df["totalAudience"]
_x = [row.strftime("%Y-%m-%d") for row in data.index]
# Scale raw admissions to units of 10,000 to match the y-axis label.
_y = data.values.astype(int) / 10000
# Plot against integer positions; the dates are attached as tick labels below.
plt.plot(range(len(_x)), _y)
interval = 7
# Label every `interval`-th point with its own date. Positions and labels are
# sliced with the same stride so their counts always match; the earlier
# version padded both sides differently, which mismatched whenever the number
# of days was not a multiple of `interval`.
plt.xticks(range(0, len(_x), interval), _x[::interval], rotation=45)
plt.xlabel("日期")
plt.ylabel("人次(单位:万)")
plt.title("2018年电影每日观影人次图")
plt.grid()
plt.show()
从上图可以看出,节假日周六周日的观影人次确实比工作日高出很多。
3)城市票房占全国票房的百分比
等等,图表数据很多,详细数据可以看源码运行。
2018电影票房分析(numpy,pandas,matplotlib)
欢迎吐槽