forked from p46318075/CodePattern
parent
83c156a3d5
commit
44b0c00567
@ -1,2 +0,0 @@
|
||||
" my Some sure acquaintance or other, my dear, sure,other I suppose; I am sure I do not
|
||||
know. sure "
|
@ -1,29 +0,0 @@
|
||||
import sys
|
||||
from cppy.cp_util import *
|
||||
|
||||
## Splitting the work into chunks could be unified into one generic function -- make it a generator!
|
||||
|
||||
# Recursion depth allowed for this run.
# Original author's note: if the script crashes, change the 5000.
RECURSION_LIMIT = 5000
sys.setrecursionlimit(RECURSION_LIMIT)

# test.txt is looked up in the directory that contains this script.
script_dir = os.path.dirname(os.path.abspath(__file__))
testfile = os.path.join(script_dir, 'test.txt')

# Words to leave out of the tally (get_stopwords presumably comes from
# the cppy.cp_util star import -- confirm).
stop_words = get_stopwords()
|
||||
|
||||
def count(i, chunks, stopwords, wordfreqs):
    """Tally the non-stop words of chunks[i], chunks[i-1], ..., chunks[0].

    Recurses once per chunk, walking the index down towards 0; a negative
    ``i`` ends the recursion. Counts are accumulated in place in
    ``wordfreqs`` (word -> number of occurrences); nothing is returned.
    """
    if i >= 0:
        for token in chunks[i]:
            if token in stopwords:
                continue
            seen = wordfreqs.get(token, 0)
            wordfreqs[token] = seen + 1
        # Descend to the previous chunk.
        count(i - 1, chunks, stopwords, wordfreqs)
|
||||
|
||||
# Read the whole test file and split it into words. The context manager
# closes the file handle, which the bare open(...).read() never did.
# (re_split is assumed to return a sliceable sequence of words -- the code
# below takes len() of it and slices it.)
with open(testfile, encoding='utf-8') as f:
    word_list = re_split(f.read())
filesize = len(word_list)

# NOTE: despite its name, chunk_size is the NUMBER of chunks; each chunk
# holds at most RECURSION_LIMIT words. The name is kept as-is.
chunk_size = (filesize // RECURSION_LIMIT) + 1

# Both slice bounds must step by RECURSION_LIMIT. The previous start bound,
# x*chunk_size, made the chunks overlap once the input reached
# RECURSION_LIMIT words, so words were counted more than once.
chunks = [word_list[x * RECURSION_LIMIT:(x + 1) * RECURSION_LIMIT]
          for x in range(chunk_size)]

word_freqs = {}
# count() recurses once per chunk, from the last chunk index down to 0.
count(chunk_size - 1, chunks, stop_words, word_freqs)

print_word_freqs(sort_dict(word_freqs))
|
Loading…
Reference in new issue