CodePattern/计算设备/极限内存/tf-01.py

import sys, os, string
from cppy.cp_util import *

def touchopen(filename, *args, **kwargs):
    """Return a handle on a freshly emptied ``filename``.

    Any existing file of that name is removed, an empty file is created in
    its place, and the result of ``open(filename, *args, **kwargs)`` is
    returned, so the caller chooses the mode (for example ``'rb+'``).
    """
    try:
        os.remove(filename)
    except OSError:
        # Removal is best-effort: a missing file is the common case.
        pass
    # Opening in append mode and closing straight away "touches" the file.
    with open(filename, "a", encoding='utf-8'):
        pass
    return open(filename, *args, **kwargs)

# The constrained memory should have no more than 1024*n cells
n = 10

# data[0] holds the stop words: at most 1024*n characters of the stop-word
# file, split on commas. The with-block closes the file even if the read
# or decode fails.
with open(stopwordfilepath, encoding='utf-8') as f:
    data = [f.read(1024 * n).split(',')]

# The remaining cells are the only working memory used while counting.
data.append([])    # data[1] is the current line (its length is not capped here)
data.append(None)  # data[2] is index of the start_char of word
data.append(0)     # data[3] is index on characters, i = 0
data.append(False) # data[4] is flag indicating if word was found
data.append('')    # data[5] is the word
data.append('')    # data[6] is word,NNNN
data.append(0)     # data[7] is frequency

# PART 1
# Count every word of the input file, keeping the running counts in a
# scratch file ("secondary memory") instead of in primary memory.
# Each record in that file is written as "%20s,%04d\n".

# Open the secondary memory
word_freqs = touchopen('word_freqs', 'rb+')
# data[8] is the byte offset of the record currently being read. Rewriting
# at this offset keeps updates aligned even when a record is not the nominal
# 26 bytes (word longer than 20 characters, or non-ASCII characters that
# take several bytes in UTF-8).
data.append(0)
# Open the input file; the with-block closes it even if processing fails
with open(testfilepath, 'r', encoding='utf-8') as f:
    # Loop over input file's lines
    while True:
        print('.', end='', flush=True)  # progress indicator, one dot per read
        data[1] = [f.readline()]
        if data[1] == ['']:  # end of input file
            break
        if data[1][0][-1] != '\n':  # If it does not end with \n
            data[1][0] = data[1][0] + '\n'  # Add \n so the last word is terminated
        data[2] = None
        data[3] = 0
        # Loop over characters in the line
        for c in data[1][0]:  # elimination of symbol c is exercise
            if data[2] is None:
                if c.isalnum():
                    # We found the start of a word
                    data[2] = data[3]
            else:
                if not c.isalnum():
                    # We found the end of a word. Process it
                    data[4] = False
                    data[5] = data[1][0][data[2]:data[3]].lower()
                    # Ignore words with len < 2, and stop words
                    if len(data[5]) >= 2 and data[5] not in data[0]:
                        # Let's see if it already exists: scan the records
                        # from the start of the secondary memory
                        while True:
                            # Remember where this record starts
                            data[8] = word_freqs.tell()
                            data[6] = str(word_freqs.readline().strip(), 'utf-8')
                            if data[6] == '':
                                break
                            data[7] = int(data[6].split(',')[1])
                            # word, no white space
                            data[6] = data[6].split(',')[0].strip()
                            if data[5] == data[6]:
                                data[7] += 1
                                data[4] = True
                                break
                        if not data[4]:
                            # New word: we are at end of file, append a record
                            word_freqs.seek(0, 1)  # Needed in Windows
                            word_freqs.write(bytes("%20s,%04d\n" % (data[5], 1), 'utf-8'))
                        else:
                            # Known word: overwrite its record in place.
                            # NOTE: a count above 9999 no longer fits in the
                            # %04d field and would widen the record.
                            word_freqs.seek(data[8], 0)
                            word_freqs.write(bytes("%20s,%04d\n" % (data[5], data[7]), 'utf-8'))
                        # Rewind so the next lookup scans from the first record
                        word_freqs.seek(0, 0)
                    # Let's reset
                    data[2] = None
            data[3] += 1
# We're done with the input file
word_freqs.flush()

# PART 2
# Select the most frequent words from the secondary memory and print them.
# Nothing from the previous phase is needed in primary memory any more.
del data[:]

# Cells 0..24 hold the best [word, freq] pairs seen so far ([] = unused),
# data[25] is the record being parsed (then just its word) and data[26]
# is that record's frequency.
data = data + [[] for _ in range(25 - len(data))] + ['', 0]

# Walk the secondary memory record by record
while True:
    data[25] = word_freqs.readline().strip().decode('utf-8')
    if not data[25]:  # empty read: end of file
        break
    # Split "word,NNNN" into the count (as an integer) and the bare word
    data[26] = int(data[25].split(',')[1])
    data[25] = data[25].split(',')[0].strip()
    # Find the first cell that is unused or holds a smaller count
    for slot in range(25):  # elimination of symbol slot is exercise
        if data[slot] != [] and data[slot][1] >= data[26]:
            continue
        data.insert(slot, [data[25], data[26]])
        # The insert shifted everything right by one; removing index 26
        # (the word cell) restores the 27-cell layout. The displaced last
        # entry now sits in data[25] and is overwritten by the next read.
        del data[26]
        break

# Print the leading ten cells, skipping any that were never filled
for entry in data[:10]:
    if len(entry) != 2:
        continue
    print(entry[0], '-', entry[1])

word_freqs.close()
01 9 months ago			`import sys, os, string`
			`from cppy.cp_util import *`

			`def touchopen(filename, *args, **kwargs):`
			`try:`
			`os.remove(filename)`
			`except OSError:`
			`pass`
			`open(filename, "a",encoding='utf-8').close() # "touch" file`
			`return open(filename, *args, **kwargs)`

			`# The constrained memory should have no more than 1024*n cells`
			`data = []`
			`n = 10`

			`f = open( stopwordfilepath,encoding='utf-8' )`
			`data = [f.read(1024*n).split(',')] # data[0] holds the stop words`
			`f.close()`

			`data.append([]) # data[1] is line (max 80 characters)`
			`data.append(None) # data[2] is index of the start_char of word`
			`data.append(0) # data[3] is index on characters, i = 0`
			`data.append(False) # data[4] is flag indicating if word was found`
			`data.append('') # data[5] is the word`
			`data.append('') # data[6] is word,NNNN`
			`data.append(0) # data[7] is frequency`

			`# Open the secondary memory`
			`word_freqs = touchopen('word_freqs', 'rb+')`
			`# Open the input file`
			`f = open( testfilepath , 'r',encoding='utf-8')`
			`# Loop over input file's lines`
			`while True:`
			`print('.',end='',flush = True)`
			`data[1] = [f.readline()]`
			`if data[1] == ['']: # end of input file`
			`break`
			`if data[1][0][len(data[1][0])-1] != '\n': # If it does not end with \n`
			`data[1][0] = data[1][0] + '\n' # Add \n`
			`data[2] = None`
			`data[3] = 0`
			`# Loop over characters in the line`
			`for c in data[1][0]: # elimination of symbol c is exercise`
			`if data[2] == None:`
			`if c.isalnum():`
			`# We found the start of a word`
			`data[2] = data[3]`
			`else:`
			`if not c.isalnum():`
			`# We found the end of a word. Process it`
			`data[4] = False`
			`data[5] = data[1][0][data[2]:data[3]].lower()`
			`# Ignore words with len < 2, and stop words`
			`if len(data[5]) >= 2 and data[5] not in data[0]:`
			`# Let's see if it already exists`
			`while True:`
			`data[6] = str(word_freqs.readline().strip(), 'utf-8')`
			`if data[6] == '':`
			`break;`
			`data[7] = int(data[6].split(',')[1])`
			`# word, no white space`
			`data[6] = data[6].split(',')[0].strip()`
			`if data[5] == data[6]:`
			`data[7] += 1`
			`data[4] = True`
			`break`
			`if not data[4]:`
			`word_freqs.seek(0, 1) # Needed in Windows`
			`word_freqs.write(bytes("%20s,%04d\n" % (data[5], 1), 'utf-8'))`
			`else:`
			`word_freqs.seek(-26, 1)`
			`word_freqs.write(bytes("%20s,%04d\n" % (data[5], data[7]), 'utf-8'))`
			`word_freqs.seek(0,0)`
			`# Let's reset`
			`data[2] = None`
			`data[3] += 1`
			`# We're done with the input file`
			`f.close()`
			`word_freqs.flush()`

			`# PART 2`
			`# Now we need to find the 25 most frequently occurring words.`
			`# We don't need anything from the previous values in memory`
			`del data[:]`

			`# Let's use the first 25 entries for the top 25 words`
			`data = data + [[]]*(25 - len(data))`
			`data.append('') # data[25] is word,freq from file`
			`data.append(0) # data[26] is freq`

			`# Loop over secondary memory file`
			`while True:`
			`data[25] = str(word_freqs.readline().strip(), 'utf-8')`
			`if data[25] == '': # EOF`
			`break`
			`data[26] = int(data[25].split(',')[1]) # Read it as integer`
			`data[25] = data[25].split(',')[0].strip() # word`
			`# Check if this word has more counts than the ones in memory`
			`for i in range(25): # elimination of symbol i is exercise`
			`if data[i] == [] or data[i][1] < data[26]:`
			`data.insert(i, [data[25], data[26]])`
			`del data[26] # delete the last element`
			`break`

			`for tf in data[0:10]:`
			`if len(tf) == 2:`
			`print(tf[0], '-', tf[1])`

			`word_freqs.close()`