这次运用数据库(MongoDB)来存储数据,并从中提取数据
成果展示
部分成果1.png
部分成果2.png
代码
import requests
import time
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Connect to the MongoDB server on localhost, default port 27017.
client = MongoClient('localhost', 27017)
# Database holding everything this script scrapes.
PhoneNum = client['PhoneNum']
# Collection filled by spider 1 (get_phone_link): number + detail-page link.
url_list = PhoneNum['url_list']
# Collection filled by spider 2 (get_item_info): title + price per detail page.
# (The original line ended with a stray ']' which made the file unparsable.)
phone_info = PhoneNum['phone_info']
# spider 1
def get_phone_link(pages):
    """Scrape one listing page and store each phone number with its link.

    Args:
        pages: Page number substituted into the ``pn{}`` segment of the
            listing URL.

    Every (number, link) pair found on the page is printed and inserted
    into the ``url_list`` collection as ``{'number': ..., 'PhoneLink': ...}``.
    Nothing is stored when the page's counter element reads ``0``.
    """
    url = 'http://bj.58.com/shoujihao/pn{}/'.format(pages)
    wb_data = requests.get(url)
    time.sleep(1)  # pause after every request to throttle the crawl
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('li a.t')
    numbers = soup.select('strong')
    counters = soup.select('#infocont > span > b')

    # select() returns Tag objects, so a plain `'0' in counters` membership
    # test never matches a string; compare the element text instead. Use
    # equality rather than substring so a count such as "10" is not mistaken
    # for an empty page.
    # NOTE(review): presumably this element holds the result count - confirm
    # against the live page markup.
    if any(counter.get_text().strip() == '0' for counter in counters):
        return

    for number, link in zip(numbers, links):
        href = link.get('href')
        if not href:
            # An anchor without href would make .split() fail on None.
            continue
        data = {
            'number': number.get_text(),
            # Drop the query string so only the bare detail-page URL is kept.
            'PhoneLink': href.split('?')[0],
        }
        print(data)
        url_list.insert_one(data)
# spider 2
def get_item_info(url):
    """Fetch one detail page and store its title and price.

    Args:
        url: Address of the detail page to download.

    Each title/price pair matched on the page has its newlines and spaces
    removed, is printed, and is inserted into the ``phone_info`` collection.
    """
    response = requests.get(url)
    time.sleep(1)  # pause after every request to throttle the crawl
    page = BeautifulSoup(response.text, 'lxml')
    title_nodes = page.select('div.col_sub.mainTitle > h1')
    price_nodes = page.select('div.col_sub.sumary >ul > li > div.su_con > span')
    for title_node, price_node in zip(title_nodes, price_nodes):
        title_text = title_node.get_text().replace("\n", "").replace(" ", "")
        price_text = price_node.get_text().replace("\n", "").replace(" ", "")
        record = {'title': title_text, 'price': price_text}
        print(record)
        phone_info.insert_one(record)
# Stage 1: walk listing pages 1..199 and record every number/link pair.
for page in range(1, 200):
    get_phone_link(page)

# Stage 2: visit each stored detail link and record its title and price.
for info in url_list.find():
    # Spider 1 stores the link under 'PhoneLink'; the documents have no
    # 'url' key, so reading info["url"] raised KeyError.
    get_item_info(info["PhoneLink"])
总结
遇到需要采集大量数据的情况,最好的方式就是将功能分离开来,一次只执行一个动作。