You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
29 lines
1004 B
29 lines
1004 B
import sys
|
|
from cppy.cp_util import *
|
|
|
|
## TODO: the task-splitting step could be unified into one generic function -- make it a generator!
|
|
|
|
# Resolve the input file relative to this script so the current working
# directory does not matter.
# NOTE(review): `os` is not imported explicitly in the visible part of this
# file; presumably it arrives through the star import from cppy.cp_util --
# confirm.
script_dir = os.path.dirname(os.path.abspath(__file__))
testfile = os.path.join(script_dir, 'test.txt')
stop_words = get_stopwords()

# If the script crashes, adjust the 5000 below.
# The same constant is also used further down when slicing the word list.
RECURSION_LIMIT = 5000
sys.setrecursionlimit(RECURSION_LIMIT)
|
|
|
|
def count(i, chunks, stopwords, wordfreqs):
    """Recursively tally word frequencies for chunks[i] down to chunks[0].

    Args:
        i: Index of the chunk to process in this call; a negative value
            ends the recursion without touching anything.
        chunks: Indexable collection of iterables of words.
        stopwords: Container of words to skip (tested with ``in``).
        wordfreqs: Mapping of word -> count, updated in place.

    Returns:
        None. Results are accumulated in ``wordfreqs``.
    """
    # Base case: every chunk from the starting index down to 0 is done.
    if i < 0:
        return
    for word in chunks[i]:
        if word not in stopwords:
            wordfreqs[word] = wordfreqs.get(word, 0) + 1
    # One recursive call per chunk, so the depth grows with the number of
    # chunks rather than with the number of words.
    count(i - 1, chunks, stopwords, wordfreqs)
|
|
|
|
# Read the whole input and split it into words; the context manager makes
# sure the file handle is closed.
# NOTE(review): re_split comes from cppy.cp_util; it is assumed to return a
# sliceable sequence of words -- confirm.
with open(testfile, encoding='utf-8') as f:
    word_list = re_split(f.read())

filesize = len(word_list)
# Despite its name, chunk_size is the NUMBER of chunks: enough slices of
# RECURSION_LIMIT words each to cover the whole list.
chunk_size = (filesize // RECURSION_LIMIT) + 1
# Both slice bounds advance by RECURSION_LIMIT so the chunks are contiguous
# and non-overlapping; every word lands in exactly one chunk. The last chunk
# may be shorter, or empty when filesize is an exact multiple.
chunks = [word_list[x * RECURSION_LIMIT:(x + 1) * RECURSION_LIMIT]
          for x in range(chunk_size)]
word_freqs = {}
# Start at the last chunk; count() recurses down to chunk 0.
count(chunk_size - 1, chunks, stop_words, word_freqs)

print_word_freqs(sort_dict(word_freqs))