You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

29 lines
1004 B

9 months ago
import sys
from cppy.cp_util import *
8 months ago
# TODO: the chunk-splitting step could be factored out into one general-purpose function -- ideally a generator.
8 months ago
9 months ago
# Recursion ceiling for the interpreter; the same constant is reused further
# down when the word list is cut into chunks.
# If the program crashes, adjust the 5000 below.
RECURSION_LIMIT = 5000
sys.setrecursionlimit(RECURSION_LIMIT)

# The input file 'test.txt' is looked up in the directory containing this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
testfile = os.path.join(script_dir, 'test.txt')

# Words to leave out of the tally (provided by the project's get_stopwords helper).
stop_words = get_stopwords()
def count(i, chunks, stopwords, wordfreqs):
    """Tally the words of ``chunks[i]`` down to ``chunks[0]`` into ``wordfreqs``.

    The function processes chunk ``i`` with a plain loop and then calls itself
    for chunk ``i - 1``, so one recursion level is used per chunk rather than
    per word.

    Args:
        i: Index of the chunk to process first; any negative value ends the
            recursion without touching ``wordfreqs``.
        chunks: Indexable collection whose items are iterables of words.
        stopwords: Container of words to skip (checked with ``in``).
        wordfreqs: Dict mapping word -> occurrence count; updated in place.

    Returns:
        None. Results are accumulated in ``wordfreqs``.
    """
    # Base case: every chunk from the starting index down to 0 has been handled.
    if i < 0:
        return
    for word in chunks[i]:
        if word not in stopwords:
            wordfreqs[word] = wordfreqs.get(word, 0) + 1
    # Recurse once per chunk, after the loop (not once per word).
    count(i - 1, chunks, stopwords, wordfreqs)
# Read the whole input file and split it with the project's re_split helper.
# The context manager guarantees the file handle is closed after reading.
with open(testfile, encoding='utf-8') as source:
    word_list = re_split(source.read())
filesize = len(word_list)

# NOTE: despite its name, chunk_size is the NUMBER of chunks. count() spends
# one recursion level per chunk, and each chunk holds at most RECURSION_LIMIT
# items of word_list.
chunk_size = (filesize // RECURSION_LIMIT) + 1

# Both slice bounds must advance by RECURSION_LIMIT so that the chunks are
# disjoint and together cover word_list exactly once. Using chunk_size as the
# start stride produced overlapping slices (and inflated counts) as soon as
# the input reached RECURSION_LIMIT words.
chunks = [word_list[x * RECURSION_LIMIT:(x + 1) * RECURSION_LIMIT]
          for x in range(chunk_size)]

word_freqs = {}
# Start at the last chunk; count() walks back down to chunk 0.
count(chunk_size - 1, chunks, stop_words, word_freqs)
print_word_freqs(sort_dict(word_freqs))