parent
31a4dfc8e5
commit
83c156a3d5
@ -1,29 +1,3 @@
|
||||
import re
|
||||
from cppy.cp_util import *
|
||||
|
||||
|
||||
def extractwords(str_data):
    """Split text into lowercase words and drop the stop words.

    Every run of non-word characters (underscore included) is treated as a
    single separator.

    Args:
        str_data: Raw text to tokenize.

    Returns:
        List of lowercase words in order of appearance, without the words
        contained in the collection returned by ``get_stopwords()``.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about.
    separators = re.compile(r'[\W_]+')
    words = separators.sub(' ', str_data).lower().split()
    stop_words = get_stopwords()
    return [w for w in words if w not in stop_words]
|
||||
|
||||
def frequencies(word_list):
    """Count how often each word occurs.

    Args:
        word_list: Iterable of hashable words.

    Returns:
        Plain ``dict`` mapping each distinct word to its number of
        occurrences, keyed in order of first appearance.
    """
    # Imported locally so the block does not depend on the file's import list.
    from collections import Counter

    # Convert back to a plain dict so callers get the same type as before.
    return dict(Counter(word_list))
|
||||
|
||||
def sort(word_freq):
    """Return the (word, count) pairs of *word_freq*, highest count first.

    Pairs with equal counts keep the order in which the mapping yields them.
    """
    pairs = list(word_freq.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
|
||||
|
||||
def printall(word_freqs, n=10):
    """Print the first *n* (word, count) pairs, one ``word - count`` line each.

    Args:
        word_freqs: Sliceable sequence of two-item pairs.
        n: Upper bound of the slice taken from *word_freqs* (default 10).
    """
    for pair in word_freqs[:n]:
        word, freq = pair
        print(word, '-', freq)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry: read the test file, tokenize it, count, rank and print.
    printall(
        sort(
            frequencies(
                extractwords(read_file(testfilepath))
            )
        )
    )
|
||||
# One-expression pipeline built from the cppy.cp_util helpers, applied to testfilepath.
# NOTE(review): presumably extract words -> count -> sort -> print, judging by
# the helper names — confirm against cppy.cp_util.
print_word_freqs( sort_dict ( get_frequencies ( extract_file_words(testfilepath) )))
|
@ -0,0 +1,46 @@
|
||||
import threading, queue
|
||||
from cppy.cp_util import *
|
||||
from collections import Counter
|
||||
|
||||
# Stop-word collection, loaded once at import time and read by every worker thread.
stop_words = get_stopwords()
|
||||
|
||||
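# Pending data is placed on a queue; several threads take turns counting it, and the partial counts are merged into one final tally.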
|
||||
class WordFrequencyCounter:
    """Count the word frequencies of a file with a pool of worker threads.

    The input is split into chunks that are placed on ``word_space``. Each
    worker repeatedly takes a chunk, counts its words and posts the partial
    result on ``freq_space``. ``run`` merges the partial results and prints
    them.
    """

    def __init__(self, input_file, chunk_size=3000, stopwords=None):
        """Queue every chunk of *input_file* for the workers.

        Args:
            input_file: Passed straight to ``get_chunks``.
            chunk_size: Second argument of ``get_chunks``; defaults to 3000,
                the value that used to be hard-coded.
            stopwords: Collection of words to ignore. Defaults to the
                module-level ``stop_words``.
        """
        self.stop_words = stop_words if stopwords is None else stopwords
        self.word_space = queue.Queue()  # chunks waiting to be counted
        self.freq_space = queue.Queue()  # one partial count dict per chunk
        for chunk in get_chunks(input_file, chunk_size):
            self.word_space.put(chunk)

    def process_words(self):
        """Worker loop: count queued chunks until the work queue is empty.

        For each chunk, words that are stop words or shorter than three
        characters are skipped; the remaining counts are put on
        ``freq_space`` as a plain dict.
        """
        while True:
            # All chunks are queued before any worker starts, so an empty
            # queue means the work is done. get_nowait() alone is enough; a
            # separate empty() check adds nothing because another worker can
            # take the last chunk in between.
            try:
                chunk = self.word_space.get_nowait()
            except queue.Empty:
                return
            words = [
                w for w in chunk
                if w not in self.stop_words and len(w) >= 3
            ]
            # Post a plain dict so consumers of freq_space see the same type
            # as before.
            self.freq_space.put(dict(Counter(words)))

    def run(self, num_workers=5):
        """Count all queued chunks and print the merged frequencies.

        Args:
            num_workers: Number of worker threads to start; defaults to 5,
                the value that used to be hard-coded.
        """
        workers = [
            threading.Thread(target=self.process_words)
            for _ in range(num_workers)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        print_word_freqs(sort_dict(self._merge_results()))

    def _merge_results(self):
        """Drain ``freq_space`` and return the summed counts as a Counter."""
        total = Counter()
        while True:
            try:
                total.update(self.freq_space.get_nowait())
            except queue.Empty:
                return total
|
||||
|
||||
|
||||
@timing_decorator
def main():
    """Build a counter for ``testfilepath`` and run it.

    The call is wrapped by ``timing_decorator``.
    """
    WordFrequencyCounter(testfilepath).run()
|
||||
|
||||
# Call main() only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
@ -1,37 +0,0 @@
|
||||
import threading, queue
|
||||
from cppy.cp_util import *
|
||||
|
||||
class WordFrequencyCounter:
    """Count word frequencies with worker threads that share one word queue.

    Every word of the input is placed on ``word_space``. Each worker tallies
    the words it takes and posts one partial dict on ``freq_space``. ``run``
    merges the partial dicts and prints the result.
    """

    def __init__(self, input_file):
        """Queue every word that ``extract_file_words`` yields for *input_file*."""
        self.word_space = queue.Queue()  # individual words waiting to be counted
        self.freq_space = queue.Queue()  # one partial count dict per worker
        for word in extract_file_words(input_file):
            self.word_space.put(word)

    def process_words(self):
        """Worker loop: tally words until the queue is empty, then post the tally.

        A dict is always posted on ``freq_space``, even when this worker
        found no words to count.
        """
        word_freqs = {}
        while True:
            # All words are queued before any worker starts, so an empty
            # queue means the work is done. get_nowait() avoids the
            # one-second stall a timed get() causes when another worker
            # takes the last word first.
            try:
                word = self.word_space.get_nowait()
            except queue.Empty:
                break
            word_freqs[word] = word_freqs.get(word, 0) + 1
        self.freq_space.put(word_freqs)

    def run(self, num_workers=5):
        """Count all queued words and print the merged frequencies.

        Args:
            num_workers: Number of worker threads to start; defaults to 5,
                the value that used to be hard-coded.
        """
        workers = [
            threading.Thread(target=self.process_words)
            for _ in range(num_workers)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

        # All workers have finished, so every partial dict is already queued.
        word_freqs = {}
        while True:
            try:
                freqs = self.freq_space.get_nowait()
            except queue.Empty:
                break
            for word, count in freqs.items():
                word_freqs[word] = word_freqs.get(word, 0) + count

        print_word_freqs(sort_dict(word_freqs))
|
||||
|
||||
|
||||
# Script entry: count the words of the test file and print the result.
if __name__ == '__main__':
    WordFrequencyCounter(testfilepath).run()
|
Loading…
Reference in new issue