python3 和 python2 的语法差异应该是最蛋疼的事情了
dict 在 Python 3.7 之前是没有保证顺序的(3.7 起保留插入顺序),但按值排序仍然需要先转换
把dict转换成list
再去排序就会比较好了
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import jieba
import csv
def dict2list(dic: dict) -> list:
    """Convert a dict into a list of (key, value) tuples.

    This is exactly ``list(dic.items())``; the original re-built the
    pairs by zipping ``keys()`` with ``values()``, which is redundant.
    Kept as a function for backward compatibility with callers below.
    """
    return list(dic.items())
# Tokenize column 4 of each CSV row with jieba and print word frequencies
# in ascending order of count.
#
# Fixes vs. the original:
# - open(..., 'U') was deprecated and removed in Python 3.11; the csv docs
#   say to open with newline='' instead.
# - The file handle was never closed; a `with` block closes it reliably.
# - A variable named `list` shadowed the builtin; renamed.
words = []
with open('/Users/dear_jinx/Desktop/zz.csv', newline='') as f:
    for row in csv.reader(f):
        # seg_list = jieba.cut_for_search(row[4])
        for token in jieba.cut(row[4]):
            words.append(token)

# Count occurrences of each token.
word = {}
for w in words:
    if w not in word:
        word[w] = 1
    else:
        word[w] += 1

# Sort (token, count) pairs by count, least frequent first.
sorted_counts = sorted(dict2list(word), key=lambda pair: pair[1], reverse=False)
for item in sorted_counts:
    print(item)
# for item in word.items():
#     print(item)
# print("/".join(words))
上面的方法太繁琐了,并且分词的效果也不好,会把一些标点符号也统计进去。
这里我们只在列表里面加入那些长度大于等于2的词
并且用counter去做统计
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import jieba
import csv
from collections import Counter
def dict2list(dic: dict) -> list:
    """Convert a dict into a list of (key, value) tuples.

    Equivalent to ``list(dic.items())`` — the original zipped ``keys()``
    and ``values()`` by hand, which is redundant.  NOTE(review): this
    helper is not used by the Counter-based script below and could be
    removed entirely; kept for interface compatibility.
    """
    return list(dic.items())
# Tokenize column 4 of each CSV row, keep only tokens of length >= 2
# (filters out punctuation and single-character noise), and print the
# 20 most common tokens via collections.Counter.
#
# Fixes vs. the original:
# - open(..., 'U') was removed in Python 3.11; the csv docs say to open
#   with newline='' instead.
# - The file handle was never closed; a `with` block closes it reliably.
words = []
with open('/Users/dear_jinx/Desktop/zz.csv', newline='') as f:
    for row in csv.reader(f):
        # seg_list = jieba.cut_for_search(row[4])
        words.extend(tok for tok in jieba.cut(row[4]) if len(tok) >= 2)

c = Counter(words).most_common(20)
print(c)