You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
109 lines
3.9 KiB
109 lines
3.9 KiB
9 months ago
|
import sys, os, string
|
||
|
from cppy.cp_util import *
|
||
|
|
||
|
def touchopen(filename, *args, **kwargs):
|
||
|
try:
|
||
|
os.remove(filename)
|
||
|
except OSError:
|
||
|
pass
|
||
|
open(filename, "a",encoding='utf-8').close() # "touch" file
|
||
|
return open(filename, *args, **kwargs)
|
||
|
|
||
|
# The constrained memory should have no more than 1024*n cells
|
||
|
data = []
|
||
|
n = 10
|
||
|
|
||
|
f = open( stopwordfilepath,encoding='utf-8' )
|
||
|
data = [f.read(1024*n).split(',')] # data[0] holds the stop words
|
||
|
f.close()
|
||
|
|
||
|
data.append([]) # data[1] is line (max 80 characters)
|
||
|
data.append(None) # data[2] is index of the start_char of word
|
||
|
data.append(0) # data[3] is index on characters, i = 0
|
||
|
data.append(False) # data[4] is flag indicating if word was found
|
||
|
data.append('') # data[5] is the word
|
||
|
data.append('') # data[6] is word,NNNN
|
||
|
data.append(0) # data[7] is frequency
|
||
|
|
||
|
# Open the secondary memory
|
||
|
word_freqs = touchopen('word_freqs', 'rb+')
|
||
|
# Open the input file
|
||
|
f = open( testfilepath , 'r',encoding='utf-8')
|
||
|
# Loop over input file's lines
|
||
|
while True:
|
||
|
print('.',end='',flush = True)
|
||
|
data[1] = [f.readline()]
|
||
|
if data[1] == ['']: # end of input file
|
||
|
break
|
||
|
if data[1][0][len(data[1][0])-1] != '\n': # If it does not end with \n
|
||
|
data[1][0] = data[1][0] + '\n' # Add \n
|
||
|
data[2] = None
|
||
|
data[3] = 0
|
||
|
# Loop over characters in the line
|
||
|
for c in data[1][0]: # elimination of symbol c is exercise
|
||
|
if data[2] == None:
|
||
|
if c.isalnum():
|
||
|
# We found the start of a word
|
||
|
data[2] = data[3]
|
||
|
else:
|
||
|
if not c.isalnum():
|
||
|
# We found the end of a word. Process it
|
||
|
data[4] = False
|
||
|
data[5] = data[1][0][data[2]:data[3]].lower()
|
||
|
# Ignore words with len < 2, and stop words
|
||
|
if len(data[5]) >= 2 and data[5] not in data[0]:
|
||
|
# Let's see if it already exists
|
||
|
while True:
|
||
|
data[6] = str(word_freqs.readline().strip(), 'utf-8')
|
||
|
if data[6] == '':
|
||
|
break;
|
||
|
data[7] = int(data[6].split(',')[1])
|
||
|
# word, no white space
|
||
|
data[6] = data[6].split(',')[0].strip()
|
||
|
if data[5] == data[6]:
|
||
|
data[7] += 1
|
||
|
data[4] = True
|
||
|
break
|
||
|
if not data[4]:
|
||
|
word_freqs.seek(0, 1) # Needed in Windows
|
||
|
word_freqs.write(bytes("%20s,%04d\n" % (data[5], 1), 'utf-8'))
|
||
|
else:
|
||
|
word_freqs.seek(-26, 1)
|
||
|
word_freqs.write(bytes("%20s,%04d\n" % (data[5], data[7]), 'utf-8'))
|
||
|
word_freqs.seek(0,0)
|
||
|
# Let's reset
|
||
|
data[2] = None
|
||
|
data[3] += 1
|
||
|
# We're done with the input file
|
||
|
f.close()
|
||
|
word_freqs.flush()
|
||
|
|
||
|
# PART 2
|
||
|
# Now we need to find the 25 most frequently occurring words.
|
||
|
# We don't need anything from the previous values in memory
|
||
|
del data[:]
|
||
|
|
||
|
# Let's use the first 25 entries for the top 25 words
|
||
|
data = data + [[]]*(25 - len(data))
|
||
|
data.append('') # data[25] is word,freq from file
|
||
|
data.append(0) # data[26] is freq
|
||
|
|
||
|
# Loop over secondary memory file
|
||
|
while True:
|
||
|
data[25] = str(word_freqs.readline().strip(), 'utf-8')
|
||
|
if data[25] == '': # EOF
|
||
|
break
|
||
|
data[26] = int(data[25].split(',')[1]) # Read it as integer
|
||
|
data[25] = data[25].split(',')[0].strip() # word
|
||
|
# Check if this word has more counts than the ones in memory
|
||
|
for i in range(25): # elimination of symbol i is exercise
|
||
|
if data[i] == [] or data[i][1] < data[26]:
|
||
|
data.insert(i, [data[25], data[26]])
|
||
|
del data[26] # delete the last element
|
||
|
break
|
||
|
|
||
|
for tf in data[0:10]:
|
||
|
if len(tf) == 2:
|
||
|
print(tf[0], '-', tf[1])
|
||
|
|
||
|
word_freqs.close()
|