RosettaCodeData/Task/Inverted-index/Go/inverted-index.go

136 lines
3.1 KiB
Go

package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"os"
)
// inverted index representation
var index map[string][]int // ints index into indexed
var indexed []doc
type doc struct {
file string
title string
}
func main() {
// initialize representation
index = make(map[string][]int)
// build index
if err := indexDir("docs"); err != nil {
fmt.Println(err)
return
}
// run user interface
ui()
}
func indexDir(dir string) error {
df, err := os.Open(dir)
if err != nil {
return err
}
fis, err := df.Readdir(-1)
if err != nil {
return err
}
if len(fis) == 0 {
return errors.New(fmt.Sprintf("no files in %s", dir))
}
indexed := 0
for _, fi := range fis {
if !fi.IsDir() {
if indexFile(dir + "/" + fi.Name()) {
indexed++
}
}
}
return nil
}
func indexFile(fn string) bool {
f, err := os.Open(fn)
if err != nil {
fmt.Println(err)
return false // only false return
}
// register new file
x := len(indexed)
indexed = append(indexed, doc{fn, fn})
pdoc := &indexed[x]
// scan lines
r := bufio.NewReader(f)
lines := 0
for {
b, isPrefix, err := r.ReadLine()
switch {
case err == io.EOF:
return true
case err != nil:
fmt.Println(err)
return true
case isPrefix:
fmt.Printf("%s: unexpected long line\n", fn)
return true
case lines < 20 && bytes.HasPrefix(b, []byte("Title:")):
// in a real program you would write code
// to skip the Gutenberg document header
// and not index it.
pdoc.title = string(b[7:])
}
// index line of text in b
// again, in a real program you would write a much
// nicer word splitter.
wordLoop:
for _, bword := range bytes.Fields(b) {
bword := bytes.Trim(bword, ".,-~?!\"'`;:()<>[]{}\\|/=_+*&^%$#@")
if len(bword) > 0 {
word := string(bword)
dl := index[word]
for _, d := range dl {
if d == x {
continue wordLoop
}
}
index[word] = append(dl, x)
}
}
}
return true
}
func ui() {
fmt.Println(len(index), "words indexed in", len(indexed), "files")
fmt.Println("enter single words to search for")
fmt.Println("enter a blank line when done")
var word string
for {
fmt.Print("search word: ")
wc, _ := fmt.Scanln(&word)
if wc == 0 {
return
}
switch dl := index[word]; len(dl) {
case 0:
fmt.Println("no match")
case 1:
fmt.Println("one match:")
fmt.Println(" ", indexed[dl[0]].file, indexed[dl[0]].title)
default:
fmt.Println(len(dl), "matches:")
for _, d := range dl {
fmt.Println(" ", indexed[d].file, indexed[d].title)
}
}
}
}