parent
ebe28f7670
commit
88606f2bce
@ -1,6 +1,6 @@
|
|||||||
import operator, collections
|
import collections
|
||||||
|
|
||||||
def top25(word_list):
|
def top_word(word_list):
|
||||||
counts = collections.Counter(w for w in word_list)
|
counts = collections.Counter(w for w in word_list)
|
||||||
return counts.most_common(10)
|
return counts.most_common(10)
|
||||||
|
|
||||||
|
@ -1,14 +0,0 @@
|
|||||||
import sys, re, string
|
|
||||||
from cppy.cp_util import *
|
|
||||||
|
|
||||||
def extract_words(path_to_file):
|
|
||||||
with open(path_to_file,encoding='utf-8') as f:
|
|
||||||
str_data = f.read()
|
|
||||||
pattern = re.compile('[\W_]+')
|
|
||||||
word_list = pattern.sub(' ', str_data).lower().split()
|
|
||||||
|
|
||||||
|
|
||||||
stop_words = get_stopwords()
|
|
||||||
|
|
||||||
return [w for w in word_list if not w in stop_words]
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
|||||||
import sys, re, string
|
|
||||||
from cppy.cp_util import *
|
|
||||||
|
|
||||||
def extract_words(path_to_file):
|
|
||||||
words = re.findall('[a-z]{2,}', open(path_to_file,encoding='utf-8').read().lower())
|
|
||||||
stopwords = get_stopwords()
|
|
||||||
return [w for w in words if w not in stopwords]
|
|
||||||
|
|
Binary file not shown.
Binary file not shown.
@ -1,19 +1,19 @@
|
|||||||
import configparser, importlib.machinery
|
import configparser, importlib.machinery
|
||||||
from cppy.cp_util import *
|
from cppy.cp_util import *
|
||||||
|
|
||||||
|
|
||||||
def load_plugins():
|
def load_plugins():
|
||||||
config = configparser.ConfigParser()
|
config = configparser.ConfigParser()
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
os.chdir(script_dir)
|
os.chdir(script_dir)
|
||||||
config.read("config.ini")
|
config.read("config.ini")
|
||||||
words_plugin = config.get("Plugins", "words")
|
|
||||||
frequencies_plugin = config.get("Plugins", "frequencies")
|
frequencies_plugin = config.get("Plugins", "frequencies")
|
||||||
global tfwords, tffreqs
|
|
||||||
tfwords = importlib.machinery.SourcelessFileLoader('tfwords', words_plugin).load_module()
|
global get_frequencies
|
||||||
tffreqs = importlib.machinery.SourcelessFileLoader('tffreqs', frequencies_plugin).load_module()
|
get_frequencies = importlib.machinery.SourcelessFileLoader('tffreqs', frequencies_plugin).load_module()
|
||||||
|
|
||||||
|
|
||||||
load_plugins()
|
load_plugins()
|
||||||
word_freqs = tffreqs.top25(tfwords.extract_words( testfilepath ))
|
|
||||||
|
|
||||||
|
wordlist = extract_file_words( testfilepath )
|
||||||
|
word_freqs = get_frequencies.top_word( wordlist )
|
||||||
print_word_freqs(word_freqs)
|
print_word_freqs(word_freqs)
|
Loading…
Reference in new issue