"""Word-frequency helpers: read text, split it into words, drop stop words,
count occurrences and print the most frequent ones."""
import site
import os
import re
import string
import operator

################################################################################
# Variables
################################################################################
# Name of the sample text inside the package's data directory.
# Other sample names that were used here: 'test.txt', 'pride-and-prejudice.txt'.
testfilename = 'Prey.txt'
db_filename = "tf.db"

site_packages = site.getsitepackages()
# Default to '' so the module still imports (data paths become relative to
# the current directory) when no site-packages entry matches, instead of
# failing with a NameError below.
basePath = ''
for package in site_packages:
    # The last entry whose path contains 'package' wins.
    if 'package' in package:
        basePath = package

stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)

# Runs of non-word characters and underscores act as word separators.
# Compiled once at import time; a raw string avoids the invalid "\W" escape.
_WORD_SEPARATOR = re.compile(r'[\W_]+')

# Words shorter than this are discarded by the extract_* functions.
_MIN_WORD_LENGTH = 3


################################################################################
# Functions
################################################################################
def read_file(path_to_file: str) -> str:
    """Return the whole content of the UTF-8 text file at *path_to_file*."""
    with open(path_to_file, encoding='utf-8') as f:
        return f.read()


def re_split(data: str) -> list:
    """Split *data* into lower-case words.

    Every run of non-word characters or underscores is treated as a
    separator. Returns an empty list when *data* contains no words.
    """
    return _WORD_SEPARATOR.sub(' ', data).lower().split()


def get_stopwords(path_to_file: str = stopwordfilepath) -> list:
    """Load the stop words from a comma-separated UTF-8 file.

    Surrounding whitespace (such as the newline at the end of the file) is
    stripped from every entry and empty entries are dropped. All single
    lower-case ASCII letters are appended to the result.
    """
    with open(path_to_file, encoding='utf-8') as f:
        raw_entries = f.read().split(',')
    stop_words = [entry.strip() for entry in raw_entries if entry.strip()]
    stop_words.extend(string.ascii_lowercase)
    return stop_words


def extract_str_words(data_str: str, stop_words=None) -> list:
    """Return the words of *data_str* that are worth counting.

    A word is kept when it is not a stop word and has at least three
    characters. The original order and duplicates are preserved.

    *stop_words* is an optional iterable of words to exclude; when it is
    None the list is loaded with get_stopwords().
    """
    if stop_words is None:
        stop_words = get_stopwords()
    # A set gives constant-time membership tests for the filter below.
    excluded = set(stop_words)
    return [
        w for w in re_split(data_str)
        if w not in excluded and len(w) >= _MIN_WORD_LENGTH
    ]


def extract_file_words(path_to_file: str, stop_words=None) -> list:
    """Return the countable words of the UTF-8 text file at *path_to_file*.

    Applies the same filtering as extract_str_words() to the file content.
    """
    return extract_str_words(read_file(path_to_file), stop_words)


def count_word(word, word_freqs, stopwords):
    """Increment the count of *word* in the dict *word_freqs* in place,
    unless *word* is contained in *stopwords*."""
    if word not in stopwords:
        word_freqs[word] = word_freqs.get(word, 0) + 1


def get_frequencies(word_list) -> dict:
    """Return a dict mapping each word of *word_list* to its number of
    occurrences."""
    word_freqs = {}
    for word in word_list:
        word_freqs[word] = word_freqs.get(word, 0) + 1
    return word_freqs


def sort_dict(word_freq: dict) -> list:
    """Return the (word, count) pairs of *word_freq* sorted by count,
    highest first."""
    return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)


def print_word_freqs(word_freqs, n=10):
    """Print the first *n* (word, count) pairs of the sequence *word_freqs*,
    one 'word - count' line per pair."""
    for (w, c) in word_freqs[:n]:
        print(w, '-', c)


def test():
    """Print a welcome message."""
    print('cppy welcome')