53 lines
1.8 KiB
Python
53 lines
1.8 KiB
Python
from collections import Counter
|
|
|
|
|
|
def termsearch(terms): # Searches full inverted index
|
|
if not set(terms).issubset(words):
|
|
return set()
|
|
return reduce(set.intersection,
|
|
(set(x[0] for x in txtindx)
|
|
for term, txtindx in finvindex.items()
|
|
if term in terms),
|
|
set(texts.keys()) )
|
|
|
|
def phrasesearch(phrase):
|
|
wordsinphrase = phrase.strip().strip('"').split()
|
|
if not set(wordsinphrase).issubset(words):
|
|
return set()
|
|
#firstword, *otherwords = wordsinphrase # Only Python 3
|
|
firstword, otherwords = wordsinphrase[0], wordsinphrase[1:]
|
|
found = []
|
|
for txt in termsearch(wordsinphrase):
|
|
# Possible text files
|
|
for firstindx in (indx for t,indx in finvindex[firstword]
|
|
if t == txt):
|
|
# Over all positions of the first word of the phrase in this txt
|
|
if all( (txt, firstindx+1 + otherindx) in finvindex[otherword]
|
|
for otherindx, otherword in enumerate(otherwords) ):
|
|
found.append(txt)
|
|
return found
|
|
|
|
|
|
finvindex = {word:set((txt, wrdindx)
|
|
for txt, wrds in texts.items()
|
|
for wrdindx in (i for i,w in enumerate(wrds) if word==w)
|
|
if word in wrds)
|
|
for word in words}
|
|
print('\nFull Inverted Index')
|
|
pp({k:sorted(v) for k,v in finvindex.items()})
|
|
|
|
print('\nTerm Search on full inverted index for: ' + repr(terms))
|
|
pp(sorted(termsearch(terms)))
|
|
|
|
phrase = '"what is it"'
|
|
print('\nPhrase Search for: ' + phrase)
|
|
print(phrasesearch(phrase))
|
|
|
|
# Show multiple match capability
|
|
phrase = '"it is"'
|
|
print('\nPhrase Search for: ' + phrase)
|
|
ans = phrasesearch(phrase)
|
|
print(ans)
|
|
ans = Counter(ans)
|
|
print(' The phrase is found most commonly in text: ' + repr(ans.most_common(1)[0][0]))
|