b站详情视频下载
经过测试,理论上所有视频都可以下载;付费的视频需要登录之后,带上 cookie 信息,就可以下载了。
1、环境
1.1Python 运行环境
1.2FFmpeg 合并视频和音频
2、代码
清单文件
requests==2.21.0
lxml==4.3.0
from lxml import etree
import requests
import subprocess
import json
import re
import os
class BiBiSpider:
    """Downloader for Bilibili video pages.

    Fetches a video/episode detail page, extracts the ``window.__playinfo__``
    JSON embedded in the HTML, downloads the media streams, and — for DASH
    streams, where video and audio are served separately — merges the two
    tracks into a single MP4 with FFmpeg.
    """

    def __init__(self):
        # NOTE(review): this cookie embeds personal session credentials
        # (SESSDATA / bili_jct). It should be loaded from a config file or
        # environment variable rather than committed to source control.
        self.headers = {
            'Referer': 'https://www.bilibili.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            "cookie": "_uuid=B75117CA-056D-136C-81C4-27DB1D06377A30553infoc; buvid3=3E3696DA-F5F3-4F55-A6AB-F0428459B3D9155836infoc; CURRENT_FNVAL=16; LIVE_BUVID=AUTO6415809639729886; rpdid=|(k|~u|YY)R)0J'ul)|mYJm)~; CURRENT_QUALITY=80; _ga=GA1.2.1444362150.1589713917; bsource=seo_baidu; sid=jceqdrok; PVID=1; DedeUserID=318464066; DedeUserID__ckMd5=2d1368bb369e1d79; SESSDATA=6c514b9f%2C1605405848%2C76c7d*51; bili_jct=208edda9bf1d5248ec6035c44959ec0e",
        }
        self.video_dirs = 'video'
        # Fix: the output directory was never created, so the first
        # download_file() call crashed with FileNotFoundError on a fresh run.
        os.makedirs(self.video_dirs, exist_ok=True)

    def download_file(self, file_path, download_url):
        """Stream ``download_url`` to ``file_path``, printing progress.

        :param file_path: destination path on disk.
        :param download_url: direct media URL (requires the Referer header).
        """
        print('*' * 100)
        print(f"保存路径:{file_path}")
        print(f'下载URL:{download_url}')
        response = requests.get(url=download_url, headers=self.headers, stream=True)
        # Fix: a missing Content-Length header previously raised KeyError and
        # a zero value caused ZeroDivisionError in the progress calculation.
        content_size = int(response.headers.get("content-length", 0))
        size = 0
        # Binary content is written in chunks; the per-chunk flush() was
        # removed — the buffered file object handles flushing on close.
        with open(file_path, "wb") as file:
            for data in response.iter_content(chunk_size=1024):
                file.write(data)
                size += len(data)
                if content_size:  # only show progress when total size is known
                    print("\r文件下载进度:%d%%(%0.2fMB/%0.2fMB)" % (
                        float(size / content_size * 100), (size / 1024 / 1024),
                        (content_size / 1024 / 1024)),
                        end=" ")
        print()

    def get_response(self, url):
        """GET ``url`` with the spider's headers; return None on any error."""
        response = None
        try:
            response = requests.get(url, headers=self.headers)
        except Exception as e:
            print(e)
        return response

    def parse_detail(self, url):
        """Parse a detail page, then download (and merge, if DASH) its media.

        :param url: Bilibili video or bangumi episode page URL.
        :return: None (results are written under ``self.video_dirs``).
        """
        response = self.get_response(url)
        if not response:
            return
        html = response.text
        document = etree.HTML(html)
        # Regular videos and bangumi episodes keep the title in different nodes.
        title = ''.join(document.xpath('//*[@class="video-title"]/@title'))
        if not title:
            title = ''.join(document.xpath('//*[@class="media-wrapper"]/h1/@title'))
        # Replace every character that is not CJK or alphanumeric with '-'
        # so the title is safe to use as a filename.
        title = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "-", title)
        pattern = r'\<script\>window\.__playinfo__=(.*?)\</script\>'
        result = re.findall(pattern, html)
        if len(result) < 1:
            return None
        data = json.loads(result[0])
        durl_list = data['data'].get('durl')
        if durl_list:
            # Legacy (non-DASH) format: complete MP4 segments.
            # Fix: every segment used to be written to the hardcoded path
            # 'video/1.mp4', so multi-part videos overwrote each other and
            # the page title was ignored.
            for index, durl in enumerate(durl_list, start=1):
                file_path = f'{self.video_dirs}/{title}_{index}.mp4'
                self.download_file(file_path, durl['url'])
        else:
            # DASH format: video and audio are separate streams that must be
            # downloaded individually and merged with FFmpeg.
            merge_path = f'{self.video_dirs}/{title}.mp4'
            video_url = data['data']['dash']['video'][0]['baseUrl']
            video_path = f'{self.video_dirs}/{title}.m4s'
            self.download_file(video_path, video_url)
            audio_url = data['data']['dash']['audio'][0]['baseUrl']
            audio_path = f'{self.video_dirs}/{title}.mp3'
            self.download_file(audio_path, audio_url)
            self.merge_video_and_audio(video_path, audio_path, merge_path)

    def merge_video_and_audio(self, video_path, audio_path, merge_path):
        """Mux ``video_path`` + ``audio_path`` into ``merge_path`` (no re-encode).

        Deletes the two intermediate files afterwards. Requires FFmpeg on PATH.
        """
        # Fix: the command was an f-string run with shell=True, which broke on
        # paths containing spaces/special characters and was injection-prone.
        # An argv list avoids shell interpretation entirely; '-y' prevents
        # FFmpeg from blocking on an interactive overwrite prompt.
        cmd = ['ffmpeg', '-y', '-i', video_path, '-i', audio_path,
               '-vcodec', 'copy', '-acodec', 'copy', merge_path]
        subprocess.call(cmd)
        print(merge_path, '合并完成')
        os.remove(video_path)
        os.remove(audio_path)

    def start_requests(self):
        """Entry URL of the crawl; hand it to parse_detail."""
        url = 'https://www.bilibili.com/bangumi/play/ep306470'
        self.parse_detail(url)

    def run(self):
        """Kick off the download."""
        self.start_requests()
if __name__ == '__main__':
    # Script entry point: build the spider and start crawling.
    spider = BiBiSpider()
    spider.run()