CodePattern/cppy/cp_util.py


import site
import os,re
import string,operator

################################################################################
#  Variables
################################################################################
# Name of the sample text file used by default. The earlier assignments
# ('test.txt', 'pride-and-prejudice.txt') were immediately overwritten, so only
# the effective value is kept.
testfilename = 'Prey.txt'

db_filename = "tf.db"

# Pick the base directory under which 'cppy/data' is expected: the last entry
# returned by site.getsitepackages() whose path contains 'package'.
site_packages = site.getsitepackages()
basePath = None
for package in site_packages:
    if 'package' in package:
        basePath = package
if basePath is None:
    # No entry matched, which previously left basePath undefined and raised
    # NameError on the next line. Fall back to the parent of this module's
    # directory -- assumes this file sits directly inside the 'cppy' package.
    basePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
stopwordfilepath = os.path.join(basePath, 'cppy', 'data', 'stop_words.txt')
testfilepath = os.path.join(basePath, 'cppy', 'data', testfilename)


################################################################################
#  Functions
################################################################################
def read_file(path_to_file):
    """Return the full contents of the text file at ``path_to_file``.

    The file is opened in text mode and decoded as UTF-8; the handle is
    closed before the function returns.
    """
    with open(path_to_file, encoding='utf-8') as source:
        return source.read()

# Matches each run of non-word characters and/or underscores. Written as a raw
# string so that ``\W`` reaches the regex engine instead of being treated as an
# (invalid) string escape, and compiled once rather than on every call.
_NON_WORD_RE = re.compile(r'[\W_]+')


def re_split( data ):
    """Split ``data`` into lowercase word tokens.

    Every run of non-word characters or underscores is replaced by a single
    space, the result is lowercased, and the text is split on whitespace.

    Args:
        data: The text to tokenize.

    Returns:
        A list of lowercase tokens; an empty list when ``data`` contains no
        word characters.
    """
    return _NON_WORD_RE.sub(' ', data).lower().split()

def get_stopwords( path_to_file = stopwordfilepath ):
    """Load the stop-word list from a comma-separated UTF-8 text file.

    The file content is split on ',' with no further trimming, and the 26
    lowercase ASCII letters are appended to the result.

    Args:
        path_to_file: Path of the stop-word file; defaults to the module-level
            ``stopwordfilepath``.

    Returns:
        A list of stop words followed by the letters 'a' through 'z'.
    """
    with open(path_to_file, encoding='utf-8') as source:
        raw_text = source.read()
    return [*raw_text.split(','), *string.ascii_lowercase]

def extract_file_words(path_to_file):
    """Return the filtered word list of the text file at ``path_to_file``.

    The file is read via ``read_file`` and the text is handed to
    ``extract_str_words``, which carries the tokenizing and filtering rules
    that were previously duplicated here.

    Args:
        path_to_file: Path of the text file to process.

    Returns:
        A list of lowercase tokens, in order of appearance, that are at least
        three characters long and are not stop words.
    """
    return extract_str_words(read_file(path_to_file))

def extract_str_words(data_str):
    """Return the filtered word list of the string ``data_str``.

    The text is tokenized with ``re_split``; tokens shorter than three
    characters or present in the list returned by ``get_stopwords()`` are
    dropped.

    Args:
        data_str: The text to process.

    Returns:
        A list of the remaining lowercase tokens in order of appearance.
    """
    # A frozenset gives constant-time membership tests; the plain list from
    # get_stopwords() was scanned linearly once per word.
    stop_words = frozenset(get_stopwords())
    return [w for w in re_split(data_str) if len(w) >= 3 and w not in stop_words]

def count_word(word, word_freqs, stopwords):
    """Increment the count of ``word`` in ``word_freqs`` unless it is a stop word.

    ``word_freqs`` is updated in place; nothing is returned.
    """
    if word in stopwords:
        return
    previous = word_freqs.get(word, 0)
    word_freqs[word] = previous + 1

def get_frequencies(word_list):
    """Count how many times each item occurs in ``word_list``.

    Returns a plain dict mapping each distinct item to its number of
    occurrences, with keys in order of first appearance.
    """
    counts = {}
    for token in word_list:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1
    return counts

def sort_dict(word_freq):
    """Return the ``(key, value)`` pairs of ``word_freq`` sorted by value, descending.

    Pairs with equal values keep the relative order they have in the dict.
    """
    ranked = list(word_freq.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

def print_word_freqs( word_freqs, n = 10):
    """Print the first ``n`` pairs of ``word_freqs``, one per line, as ``word - count``.

    ``word_freqs`` must be a sliceable sequence of two-item pairs.
    """
    leading = word_freqs[:n]
    for entry in leading:
        word, count = entry
        print(word, '-', count)


def test():
    """Print a fixed welcome message to standard output."""
    greeting = 'cppy welcome'
    print(greeting)
3 9 months ago
			`import site`
			`import os,re`
			`import string,operator`

			`################################################################################`
			`# 变量`
			`################################################################################`
			`testfilename = 'test.txt'`
			`testfilename = 'pride-and-prejudice.txt'`
			`testfilename = 'Prey.txt'`

db 路径修正 9 months ago			`db_filename = "tf.db"`

3 9 months ago			`site_packages = site.getsitepackages()`
			`for package in site_packages:`
			`if 'package' in package:`
			`basePath = package`
			`stopwordfilepath = os.path.join(basePath, 'cppy','data','stop_words.txt')`
			`testfilepath = os.path.join(basePath, 'cppy','data',testfilename )`


			`################################################################################`
			`# 函数`
			`################################################################################`
			`def read_file(path_to_file):`
			`with open(path_to_file,encoding='utf-8') as f:`
			`data = f.read()`
			`return data`

			`def re_split( data ):`
			`pattern = re.compile('[\W_]+')`
			`data = pattern.sub(' ', data).lower()`
			`return data.split()`

			`def get_stopwords( path_to_file = stopwordfilepath ):`
			`with open(path_to_file,encoding='utf-8') as f:`
			`data = f.read().split(',')`
			`data.extend(list(string.ascii_lowercase))`
			`return data`

			`def extract_file_words(path_to_file):`
			`word_list = re_split( read_file(path_to_file) )`
			`stop_words = get_stopwords()`
			`return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ]`

			`def extract_str_words(data_str):`
			`word_list = re_split( data_str )`
			`stop_words = get_stopwords()`
			`return [ w for w in word_list if ( not w in stop_words ) and len(w) >= 3 ]`

			`def count_word(word, word_freqs, stopwords):`
			`if word not in stopwords:`
			`word_freqs[word] = word_freqs.get(word, 0) + 1`

			`def get_frequencies(word_list):`
			`word_freqs = {}`
			`for word in word_list:`
			`word_freqs[word] = word_freqs.get(word, 0) + 1`
			`return word_freqs`

			`def sort_dict (word_freq):`
			`return sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)`
			`# return sorted( word_freq, key=lambda x: x[1], reverse=True )`

			`def print_word_freqs( word_freqs, n = 10):`
			`for (w, c) in word_freqs[ :n ]:`
			`print( w, '-', c )`


			`def test():`
			`print( 'cppy welcome' )`