parent 41a14b6705
commit 7db531d2fc
@ -1,56 +1,56 @@
# -*- coding: utf-8 -*-
from collections import Counter
from cppy.cp_util import *
from multiprocessing.pool import ThreadPool


#
# Multithreaded version
#
def process_chunk(chunk):
    # Filter out stop words and very short words
    stop_words = get_stopwords()
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def merge_counts(counts_list):
    # Merge multiple Counter objects into one
    total_counts = Counter()
    for counts in counts_list:
        total_counts += counts
    return total_counts


def thread_function(chunk, counts_list):
    # Worker for manually managed threads; unused here, since main()
    # drives the workers through pool.map instead
    word_count = process_chunk(chunk)
    counts_list.append(word_count)


@timing_decorator
def main():
    # Read the file content and split it into words
    content = re_split(read_file(testfilepath))
    chunk_size = 1000  # adjust the chunk size as needed
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]

    # Use a thread pool; each thread processes one chunk
    pool = ThreadPool(len(chunks))
    counts_list = pool.map(process_chunk, chunks)
    pool.close()
    pool.join()

    # Merge the per-chunk counts
    total_counts = merge_counts(counts_list)

    # Print the n most frequent words
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()
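In CPython the GIL serializes the pure-Python counting above, so threads buy little wall-clock speedup here; they pay off mainly for I/O-bound work. For comparison, a minimal sketch of the same fan-out using the standard-library concurrent.futures API, reusing process_chunk and merge_counts from above (an editorial sketch, not part of this commit):

from concurrent.futures import ThreadPoolExecutor

def count_with_executor(chunks, max_workers=8):
    # Fan the chunks out to worker threads, then fold the partial
    # Counters back together with merge_counts().
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return merge_counts(executor.map(process_chunk, chunks))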
@ -1,49 +1,49 @@
# -*- coding: utf-8 -*-
import multiprocessing
from collections import Counter
from cppy.cp_util import *


#
# Multiprocess version
#
def process_chunk(chunk):
    # Filter out stop words and very short words
    stop_words = get_stopwords()
    words = [w for w in chunk if w not in stop_words and len(w) >= 3]
    return Counter(words)


def merge_counts(counts_list):
    # Merge multiple Counter objects into one
    total_counts = Counter()
    for counts in counts_list:
        total_counts += counts
    return total_counts


@timing_decorator
def main():
    # Read the file content and split it into words
    content = re_split(read_file(testfilepath))

    # Split the content into chunks; each chunk is handled by one process
    chunk_size = 1000  # adjust the chunk size as needed
    chunks = [content[i:i + chunk_size] for i in range(0, len(content), chunk_size)]

    # Process the chunks in a pool of worker processes
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    counts_list = pool.map(process_chunk, chunks)
    pool.close()
    pool.join()

    # Merge the per-chunk counts
    total_counts = merge_counts(counts_list)

    # Print the n most frequent words
    print_word_freqs(total_counts.most_common(10))


if __name__ == '__main__':
    main()
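merge_counts is a left fold with the empty Counter as its identity, so it can also be written in one line with functools.reduce (an equivalent editorial sketch, not from the commit):

from functools import reduce
from operator import add
from collections import Counter

def merge_counts(counts_list):
    # Fold the per-chunk Counters into a single total count.
    return reduce(add, counts_list, Counter())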
@ -1,59 +1,59 @@
# -*- coding: utf-8 -*-
import cppy.cp_util as util
from collections import Counter


class WordFrequencyStateMachine:
    def __init__(self, file_path):
        self.file_path = file_path
        self.content = None
        self.words = None
        self.word_freq = None
        self.state = 'IDLE'

    def transition_to_read_file(self):
        try:
            with open(self.file_path, 'r', encoding='utf-8') as file:
                self.content = file.read()
            self.state = 'WORDS_SPLIT'
        except FileNotFoundError:
            print(f"File {self.file_path} not found.")
            self.state = 'DONE'  # stop the machine instead of looping forever
        except Exception as e:
            print(f"Error while reading the file: {e}")
            self.state = 'DONE'

    def transition_to_split_words(self):
        if self.content is not None:
            self.words = util.extract_str_words(self.content)
            self.state = 'CALCULATE_FREQ'
        else:
            print("The file content is empty; cannot split words.")
            self.state = 'DONE'

    def transition_to_calculate_freq(self):
        if self.words is not None:
            self.word_freq = Counter(self.words)
            self.state = 'DONE'
        else:
            print("The word list is empty; cannot compute word frequencies.")
            self.state = 'DONE'

    def run(self):
        while self.state != 'DONE':
            if self.state == 'IDLE':
                self.transition_to_read_file()
            elif self.state == 'WORDS_SPLIT':
                self.transition_to_split_words()
            elif self.state == 'CALCULATE_FREQ':
                self.transition_to_calculate_freq()
            else:
                print(f"Unknown state: {self.state}")
                break

        return self.word_freq


# Compute the word frequencies with the state machine
state_machine = WordFrequencyStateMachine(util.testfilepath)
word_frequencies = state_machine.run()

# Print the results (word_freq stays None if any step failed)
if word_frequencies is not None:
    util.print_word_freqs(word_frequencies.most_common(10))
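The if/elif ladder in run() can also be table-driven, which keeps the state-to-handler mapping in one place; a sketch as a subclass of the machine above (editorial, same behavior, not part of the commit):

class TableDrivenStateMachine(WordFrequencyStateMachine):
    def run(self):
        # Map each state to its transition method; unknown states stop
        # the machine, mirroring the else branch above.
        handlers = {
            'IDLE': self.transition_to_read_file,
            'WORDS_SPLIT': self.transition_to_split_words,
            'CALCULATE_FREQ': self.transition_to_calculate_freq,
        }
        while self.state != 'DONE':
            handler = handlers.get(self.state)
            if handler is None:
                print(f"Unknown state: {self.state}")
                break
            handler()
        return self.word_freq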
@ -1,33 +1,33 @@
# -*- coding: utf-8 -*-
import cppy.cp_util as util

# Each column is a pair of (data, formula); the first column holds the
# input data, so it has no formula
all_words = [(), None]
non_stop_words = [(), util.extract_str_words]
frequencies = [(), util.get_frequencies]
sorted_data = [(), util.sort_dict]

# The whole spreadsheet
all_columns = [all_words, non_stop_words,
               frequencies, sorted_data]


# Call this after new data is entered
def update():
    global all_columns
    for c in all_columns[1:]:
        if c[1] == util.extract_str_words:
            c[0] = c[1](all_words[0])
        elif c[1] == util.get_frequencies:
            c[0] = c[1](non_stop_words[0])
        elif c[1] == util.sort_dict:
            c[0] = c[1](frequencies[0])


# Load the fixed input data into the first column
all_words[0] = util.read_file(util.testfilepath)
# Call update to recompute the derived columns
update()

# Print the results
util.print_word_freqs(sorted_data[0])
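update() hard-codes which column feeds each formula. A variant in which every column records its upstream column drops the if/elif chain entirely (an editorial sketch of the same dataflow, not part of the commit):

import cppy.cp_util as util

# Each column is [value, formula, source column]; the input column has
# neither a formula nor a source.
all_words = [(), None, None]
non_stop_words = [(), util.extract_str_words, all_words]
frequencies = [(), util.get_frequencies, non_stop_words]
sorted_data = [(), util.sort_dict, frequencies]
all_columns = [all_words, non_stop_words, frequencies, sorted_data]

def update():
    # Recompute each derived column from its recorded source column.
    for column in all_columns[1:]:
        column[0] = column[1](column[2][0])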