python的css选择器实例,你都用python做什么?
在律所里分析判决书!相当有用!
上代码:
"""Extract structured fields from court-judgment .docx files into a CSV.

Run as a script, this appends one header row and then one data row per
document listed in ``get_file_path.all_file_list`` to ``process_book.csv``.

The regular expressions accept both ASCII and full-width CJK punctuation
(``:``/``：``, ``,``/``，``, ``(``/``（``, ``)``/``）``).
"""
import csv
import re

OUTPUT_CSV = 'process_book' + '.csv'

# Column order must stay in lockstep with the row built by parse_judgment().
HEADER = [
    '案号', '法院名称', '原审地',
    '再审申请人1', '再审申请人1住址',
    '再审申请人2', '再审申请人2住址',
    '再审申请人3', '再审申请人3住址',
    '再审申请人一审地位', '再审申请人二审地位',
    '被申请人1住址', '被申请人2住址', '被申请人3住址',
    '被申请人的类型,0自然人,1公司法人,2机关法人',
    '再审申请人是否有律师', '被申请人是否有律师',
    '合议庭人数', '审判长姓名', '审判员1', '审判员2',
]


def _pad(values, size):
    """Return exactly *size* items: *values* truncated or padded with ''."""
    values = list(values)[:size]
    return values + [''] * (size - len(values))


def _court_name(text):
    """Return the text following '审理法院: ' or '' when it is absent."""
    match = re.search(r'审理法院[:：] (.*)', text)
    return match.group(1) if match else ''


def _case_district(text):
    """Return the first three CJK characters of the court being appealed.

    When several '...不服...人民法院' lines exist the last usable one wins;
    '' is returned when none is found.
    """
    district = ''
    for _, court in re.findall(r'(.*)不服(.*)人民法院', text):
        head = re.search(r'[\u4e00-\u9fa5]{3}', court)
        if head:
            district = head.group(0)
    return district


def _appellant_names(text):
    """Return the retrial applicants' names, trying three line layouts."""
    # Layout 1: '再审申请人(...):NAME,sex,...' -> keep what precedes the
    # first comma.
    names = [
        re.split(r'[,，]', rest)[0]
        for _, rest in re.findall(r'再审申请人(.*)[:：](.*)[,，]', text)
    ]
    # Layout 2: 'NAME。住所...' - used when layout 1 produced no first name.
    if not names or not names[0]:
        names.extend(
            rest
            for _, rest in re.findall(r'再审申请人(.*)[:：](.*)。住所', text)
        )
    # Layout 3: the name is the whole remainder of a line ending in '。'.
    if not names:
        names.extend(
            rest
            for _, rest in re.findall(r'再审申请人(.*)[:：](.*)。\n', text)
        )
    return names


def _has_lawyer(text, party):
    """Return 1 if '律师' occurs between *party* and the last agent line.

    NOTE(review): the span is greedy, so it runs from the first
    '<party>(' to the last '委托诉讼代理人:' line in the document and may
    include the other side's agents - confirm this is acceptable.
    """
    match = re.search(party + r'[(（][\s\S]*委托诉讼代理人[:：].*', text)
    return 1 if match and '律师' in match.group(0) else 0


def _addresses(text, primary, fallback):
    """Return addresses from *primary* matches, else from *fallback*.

    Both patterns must have two groups; the second one is the address.
    """
    found = re.findall(primary, text) or re.findall(fallback, text)
    # '住所地:XXX' leaves a leading colon inside the captured group.
    return [address.strip(':：') for _, address in found]


def _respondent_type(text):
    """Classify the respondent: 0 natural person, 1 company, 2 government.

    NOTE(review): as in the original logic, '政府' is only looked for when
    no comma-terminated respondent line exists - confirm this is intended.
    """
    with_comma = re.findall(r'被申请人(.*)[:：](.*)[,，]', text)
    if with_comma:
        return 1 if any('公司' in rest for _, rest in with_comma) else 0
    with_stop = re.findall(r'被申请人(.*)[:：](.*)。', text)
    return 2 if any('政府' in rest for _, rest in with_stop) else 0


def _appellant_levels(text):
    """Return the applicant's roles in earlier instances, split on '、'."""
    found = re.findall(r'再审申请人[(（](.*)[)）][:：]', text)
    return found[0].split('、') if found else []


def _bench(paragraphs):
    """Return (bench size, presiding judge, list of judges).

    The bench size is ``None`` when there are no paragraphs at all,
    otherwise 1 + the number of '审判员' entries.
    """
    chief = ''
    judges = []
    for paragraph in paragraphs:
        for name in re.findall(r'审判长 (.+)', paragraph):
            chief = name.replace(' ', '')
        judges.extend(
            name.replace(' ', '')
            for name in re.findall(r'审判员 (.+)', paragraph)
        )
    size = 1 + len(judges) if paragraphs else None
    return size, chief, judges


def parse_judgment(text, paragraphs):
    """Build one CSV row (same column order as HEADER) for a judgment.

    Args:
        text: full plain text of the document.
        paragraphs: list of the document's paragraph texts, in order.

    Returns:
        A list with ``len(HEADER)`` items. Missing string fields are '',
        a missing case number is '出错'.
    """
    paragraphs = list(paragraphs)
    # The case number is taken from the eighth paragraph.
    try:
        number = paragraphs[7]
    except IndexError:
        number = '出错'

    names = _pad(_appellant_names(text), 3)
    addresses = _pad(
        _addresses(
            text,
            r'再审申请人(.*)[:：].*住所地(.*)。',  # legal person
            r'再审申请人(.*)[:：].*住(.+)。',      # natural person
        ),
        3,
    )
    respondent_addresses = _pad(
        _addresses(
            text,
            r'被申请人(.*)[:：].*住所地(.*)。',
            r'被申请人(.*)[:：].*住(.*)。',
        ),
        3,
    )
    bench_size, chief_judge, judges = _bench(paragraphs)

    row = [number, _court_name(text), _case_district(text)]
    for name, address in zip(names, addresses):
        row.extend([name, address])
    row.extend(_pad(_appellant_levels(text), 2))
    row.extend(respondent_addresses)
    row.extend([
        _respondent_type(text),
        _has_lawyer(text, '再审申请人'),
        _has_lawyer(text, '被申请人'),
        bench_size,
        chief_judge,
    ])
    row.extend(_pad(judges, 2))
    return row


def _append_row(row):
    """Append *row* to OUTPUT_CSV."""
    # newline='' lets the csv module control line endings itself.
    with open(OUTPUT_CSV, 'a+', encoding='utf-8', newline='') as handle:
        csv.writer(handle).writerow(row)


def main():
    """Write the header, then one row per file in ``all_file_list``."""
    # Imported here so the parsing helpers stay importable without the
    # third-party / project modules being installed.
    import docx
    import docxpy
    from get_file_path import all_file_list

    _append_row(HEADER)
    for path in all_file_list:
        document = docx.Document(path)
        print('------docx已经顺利读出文件,开始处理中------')
        text = docxpy.process(path)
        print('------docxpy已经顺利读出文件,开始处理中------')
        row = parse_judgment(
            text, [paragraph.text for paragraph in document.paragraphs]
        )
        print(row[0])
        _append_row(row)


if __name__ == '__main__':
    main()
有什么好的python3爬虫入门教程或书籍吗?
Python网络爬虫比较容易学习,让人比较有成就感。下面我来说说我的看法,供大家参考参考:
1.精通Python网络爬虫 核心技术、框架与项目实战,韦玮(书籍,容易入门,但知识点不是很全)
2.用Python写网络爬虫(书籍,入门级)
3.Python爬虫开发与项目实战(书籍,知识点比较全,但对于没学过编程或者说编程能力不强的朋友来说,不容易学)
4. Python 3网络爬虫开发实战(书籍,比较难)
建议初学者以1,2为主要方向跟着学习,以3,4为辅助资料学习。
(小编有以上资料,需要的读者朋友私聊我)
以上是我的看法,希望对读者有帮助。
学了一个月的Python?
给自己定一些题目和目标,边做边学,是一种很快速有效的学习方式。曾经我出过一个题目,是石头剪刀布的游戏:由电脑随机生成石头、剪刀或布,玩家手动输入,再将玩家和电脑的出拳进行比较并输出结果。如果玩家赢则总分+1,输则-1,平局不得分。每次游戏都将电脑出拳、玩家出拳、结果和最后分数写进excel文件中。如果玩家输入的是拼音,也可以正常玩游戏;如果输入的是石头剪刀布之外的内容,则直接提示报错。
就这样一个题目,做出来需要很多的知识储备,而且又围绕着一个题目做下去,就会觉得自己成就感很高。
该如何学习python?
我的专栏里面有我制作的python入门课程,可以作为参考。这个课程是我在实际教学中制作的,适合初学者,也适合参加过培训机构速成班之后、想进行系统学习的学习者。至于就业前景,个人认为只掌握单一语言肯定是不够的;不过python作为初学语言难度比较小,入门比较容易,可以为以后学习其他语言奠定基础。