作为一名刚学了几天的新手,在这里只使用了 requests 和 re 两个库。
分析网页 https://movie.douban.com/top250?start=0&filter= ,注意:每翻到下一页,start 的值就会 +25。
正则 :观察网页源代码得到
电影名 titlepat = 'class="">.*?<span class="title">(.*?)</span>'
人数 countpat = '<span>(.*?)人评价</span>'
评分 gradepat = ' <span class="rating_num" property="v:average">(.*?)</span>'
短评 briefcommentpat = '<span class="inq">(.*?)</span>'
对于文件的读写还有很多不足之处,没有进行很深入的了解,若有好的建议,求大牛指教。
以下是源码:
import requests
import re
def geturl(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body decoded to ``str`` using the encoding that
        requests detects from the content itself.

    Raises:
        requests.RequestException: On connection failure or timeout, and
            (as ``requests.HTTPError``) when the server answers with a
            4xx/5xx status.
    """
    # NOTE(review): 'Mozilla/50' looks like a typo for 'Mozilla/5.0'; it is
    # left unchanged here -- confirm against the site before editing it.
    headers = {'User-Agent': 'Mozilla/50'}
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # regex parser, which would silently yield no movies.
    r.raise_for_status()
    # Decode with the encoding detected from the body rather than the one
    # declared (or defaulted) from the HTTP headers.
    r.encoding = r.apparent_encoding
    return r.text
# Patterns for one movie entry on the listing page. Only the title pattern is
# compiled with re.S, because the text between `class="">` and the title span
# crosses line breaks.
_TITLE_RE = re.compile('class="">.*?<span class="title">(.*?)</span>', re.S)
_COUNT_RE = re.compile('<span>(.*?)人评价</span>')
_GRADE_RE = re.compile(' <span class="rating_num" property="v:average">(.*?)</span>')
_COMMENT_RE = re.compile('<span class="inq">(.*?)</span>')


def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ''."""
    found = pattern.search(text)
    return found.group(1) if found else ''


def _parse_movies(data):
    """Extract one (title, grade, count, comment) tuple per movie from *data*.

    The page text is cut into one segment per title match and the other three
    fields are searched for inside that segment only, so a movie that lacks a
    field (for example, no short quote) gets '' for it instead of shifting
    every later movie's data by one position.

    Assumes each movie's rating, vote count and quote appear after its title
    and before the next movie's title -- TODO confirm against the live page.
    """
    title_matches = list(_TITLE_RE.finditer(data))
    movies = []
    for index, match in enumerate(title_matches):
        # Anchor on the captured title text (group 1), not the whole match:
        # the lazy `.*?` prefix may start earlier in the page than the title.
        start = match.end(1)
        if index + 1 < len(title_matches):
            stop = title_matches[index + 1].start(1)
        else:
            stop = len(data)
        segment = data[start:stop]
        movies.append((
            match.group(1),
            _first_group(_GRADE_RE, segment),
            _first_group(_COUNT_RE, segment),
            _first_group(_COMMENT_RE, segment),
        ))
    return movies


def getdata(url):
    """Scrape one listing page and append its movies to the output file.

    Fetches *url* with ``geturl``, extracts title, rating, vote count and
    short quote for every movie, prints the four field lists, and appends two
    lines per movie to "豆瓣电影排名.txt" in the current directory.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None.
    """
    data = geturl(url)
    movies = _parse_movies(data)

    titles = [movie[0] for movie in movies]
    grades = [movie[1] for movie in movies]
    counts = [movie[2] for movie in movies]
    briefcomments = [movie[3] for movie in movies]
    print(titles)
    print(counts)
    print(grades)
    print(briefcomments)

    # Explicit UTF-8 keeps the output independent of the platform's default
    # encoding; `with` closes the file even if a write raises.
    with open("豆瓣电影排名.txt", 'a', encoding='utf-8') as f:
        for title, grade, count, comment in movies:
            f.write('名称:' + title + '\t')
            f.write('评分:' + grade + '\t')
            f.write('人数:' + count + '\n')
            f.write(comment + '\n')
if __name__ == '__main__':
    # The `start` query parameter is the page offset; it advances by 25 per
    # page, and offsets 0 through 225 are requested (ten pages in total).
    for page in range(0, 250, 25):
        print(page)
        getdata("https://movie.douban.com/top250?start=" + str(page))