直接上代码吧
# -*- coding: utf-8 -*-
from elasticsearch import Elasticsearch
# 日志的配置环境
import platform
import logging.handlers
# Pick the output file by operating system: a relative file on Windows,
# an absolute path at the filesystem root everywhere else.
sys_platform = platform.system()
LOG_FILE_check = './app_cic.txt' if sys_platform == "Windows" else '/cic1.log'

# Rotate once a file reaches 1200 * 1024 * 1024 bytes (1200 MiB), keeping at
# most 10 rotated backups.
# NOTE(review): the previous comment claimed 200M per file - confirm which
# size is actually intended.
handler = logging.handlers.RotatingFileHandler(
    LOG_FILE_check,
    maxBytes=1200 * 1024 * 1024,
    backupCount=10,
)

# Each record is written as a newline followed by the bare message
# (no timestamp, level or logger name).
fmt = '\n%(message)s'
formatter = logging.Formatter(fmt)
handler.setFormatter(formatter)

# Logger named 'check'; DEBUG level lets every record through to the handler.
logger = logging.getLogger('check')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
# Client for the cluster node at 20.0.0.11:9200, with node sniffing enabled
# at startup and after a connection failure (60 s sniff timeout).
es = Elasticsearch(
    ["20.0.0.11:9200"],
    sniff_on_start=True,
    sniff_on_connection_fail=True,
    sniff_timeout=60,
)
import time

# Terms query: match documents whose "site" field equals one of the listed values.
query_json = {
    "query": {
        "terms": {
            "site": [
                "百度搜索"
            ]
        }
    }
}

page_num = 1000  # number of hits requested per page

# The initial search opens a scroll context (kept alive 5 minutes per request)
# and already returns the first page of hits.
query = es.search(index='guoyan_index_v1', body=query_json, scroll='5m', size=page_num)
results = query['hits']['hits']  # first page of hits

# hits.total is a plain int on Elasticsearch < 7 and an object of the form
# {"value": N, "relation": ...} on 7+; normalise to an int so the page
# arithmetic below works against either server version.
hits_total = query['hits']['total']
total = hits_total['value'] if isinstance(hits_total, dict) else hits_total

scroll_id = query['_scroll_id']  # cursor used to fetch the following pages
every_num = total // page_num  # number of full pages in the reported total

print("total",total)
print("scroll_id",scroll_id)
print("every_num",every_num)
alist = []  # not used by the code visible here; kept in case later code reads it
print("----------",int(total/page_num)+1)

# Gather "<other6> <source>" strings from every page of the scroll.
# A set drops duplicates as they arrive instead of holding every copy in
# memory until the end.
unique_records = set()

# Start with the page returned by the initial search (it was previously
# skipped), then keep scrolling until the server returns an empty page
# rather than trusting a page count derived from the reported total.
page_hits = results
page_index = 0
while page_hits:
    print("正在读取的位置是:", page_index)
    for hit in page_hits:
        try:
            doc = hit['_source']
            unique_records.add(doc["other6"] + " " + doc["source"])
        except (KeyError, TypeError):
            # Best effort: skip hits that lack either field or hold a
            # value that cannot be concatenated with a string.
            continue
    response = es.scroll(scroll_id=scroll_id, scroll='5m')
    # Always continue from the most recently returned scroll id.
    scroll_id = response['_scroll_id']
    page_hits = response['hits']['hits']
    page_index += 1

end_data_list = list(unique_records)
print("去重以后的数据是条数是:",len(end_data_list))

# The 'check' logger is the output channel: one record per unique string.
for end_data in end_data_list:
    logger.info(end_data)