parent
							
								
									fe94d8ed1b
								
							
						
					
					
						commit
						b86f626e94
					
				| @ -1,109 +0,0 @@ | ||||
| import sys, os, string | ||||
| from cppy.cp_util import * | ||||
| 
 | ||||
| def touchopen(filename, *args, **kwargs): | ||||
|     try: | ||||
|         os.remove(filename) | ||||
|     except OSError: | ||||
|         pass | ||||
|     open(filename, "a",encoding='utf-8').close() # "touch" file | ||||
|     return open(filename, *args, **kwargs) | ||||
| 
 | ||||
| # The constrained memory should have no more than 1024*n cells | ||||
| data = [] | ||||
| n = 10 | ||||
| 
 | ||||
| f = open( stopwordfilepath,encoding='utf-8' ) | ||||
| data = [f.read(1024*n).split(',')] # data[0] holds the stop words | ||||
| f.close() | ||||
| 
 | ||||
| data.append([])    # data[1] is line (max 80 characters) | ||||
| data.append(None)  # data[2] is index of the start_char of word | ||||
| data.append(0)     # data[3] is index on characters, i = 0 | ||||
| data.append(False) # data[4] is flag indicating if word was found | ||||
| data.append('')    # data[5] is the word | ||||
| data.append('')    # data[6] is word,NNNN | ||||
| data.append(0)     # data[7] is frequency | ||||
| 
 | ||||
| # Open the secondary memory | ||||
| word_freqs = touchopen('word_freqs', 'rb+') | ||||
| # Open the input file | ||||
| f = open( testfilepath , 'r',encoding='utf-8') | ||||
| # Loop over input file's lines | ||||
| while True: | ||||
|     print('.',end='',flush = True) | ||||
|     data[1] = [f.readline()]  | ||||
|     if data[1] == ['']: # end of input file | ||||
|         break | ||||
|     if data[1][0][len(data[1][0])-1] != '\n': # If it does not end with \n | ||||
|         data[1][0] = data[1][0] + '\n' # Add \n | ||||
|     data[2] = None | ||||
|     data[3] = 0  | ||||
|     # Loop over characters in the line | ||||
|     for c in data[1][0]: # elimination of symbol c is exercise | ||||
|         if data[2] == None: | ||||
|             if c.isalnum(): | ||||
|                 # We found the start of a word | ||||
|                 data[2] = data[3] | ||||
|         else: | ||||
|             if not c.isalnum(): | ||||
|                 # We found the end of a word. Process it | ||||
|                 data[4] = False  | ||||
|                 data[5] = data[1][0][data[2]:data[3]].lower() | ||||
|                 # Ignore words with len < 2, and stop words | ||||
|                 if len(data[5]) >= 2 and data[5] not in data[0]: | ||||
|                     # Let's see if it already exists | ||||
|                     while True: | ||||
|                         data[6] = str(word_freqs.readline().strip(), 'utf-8') | ||||
|                         if data[6] == '': | ||||
|                             break; | ||||
|                         data[7] = int(data[6].split(',')[1]) | ||||
|                         # word, no white space | ||||
|                         data[6] = data[6].split(',')[0].strip()  | ||||
|                         if data[5] == data[6]: | ||||
|                             data[7] += 1 | ||||
|                             data[4] = True | ||||
|                             break | ||||
|                     if not data[4]: | ||||
|                         word_freqs.seek(0, 1) # Needed in Windows | ||||
|                         word_freqs.write(bytes("%20s,%04d\n" % (data[5], 1), 'utf-8')) | ||||
|                     else: | ||||
|                         word_freqs.seek(-26, 1) | ||||
|                         word_freqs.write(bytes("%20s,%04d\n" % (data[5], data[7]), 'utf-8')) | ||||
|                     word_freqs.seek(0,0) | ||||
|                 # Let's reset | ||||
|                 data[2] = None | ||||
|         data[3] += 1 | ||||
| # We're done with the input file | ||||
| f.close() | ||||
| word_freqs.flush() | ||||
| 
 | ||||
| # PART 2 | ||||
| # Now we need to find the 25 most frequently occurring words. | ||||
| # We don't need anything from the previous values in memory | ||||
| del data[:] | ||||
| 
 | ||||
| # Let's use the first 25 entries for the top 25 words | ||||
| data = data + [[]]*(25 - len(data)) | ||||
| data.append('') # data[25] is word,freq from file | ||||
| data.append(0)  # data[26] is freq | ||||
| 
 | ||||
| # Loop over secondary memory file | ||||
| while True: | ||||
|     data[25] = str(word_freqs.readline().strip(), 'utf-8') | ||||
|     if data[25] == '': # EOF | ||||
|         break | ||||
|     data[26] = int(data[25].split(',')[1]) # Read it as integer | ||||
|     data[25] = data[25].split(',')[0].strip() # word | ||||
|     # Check if this word has more counts than the ones in memory | ||||
|     for i in range(25): # elimination of symbol i is exercise | ||||
|         if data[i] == [] or data[i][1] < data[26]: | ||||
|             data.insert(i, [data[25], data[26]])  | ||||
|             del data[26] #  delete the last element | ||||
|             break | ||||
|              | ||||
| for tf in data[0:10]:  | ||||
|     if len(tf) == 2: | ||||
|         print(tf[0], '-', tf[1]) | ||||
| 
 | ||||
| word_freqs.close() | ||||
					Loading…
					
					
				
		Reference in new issue