// Part 1: Inverted index structure

use std::{
    borrow::Borrow,
    collections::{BTreeMap, BTreeSet},
};

/// An inverted index mapping tokens to the sources that contain them.
#[derive(Debug, Default)]
pub struct InvertedIndex<T> {
    /// Maps each token to the set of ids (indices into `sources`) containing it.
    indexed: BTreeMap<String, BTreeSet<usize>>,
    /// The indexed sources, addressed by insertion order.
    sources: Vec<T>,
}

impl<T> InvertedIndex<T> {
    /// Indexes `source` under every token produced by `tokens`.
    pub fn add<I, V>(&mut self, source: T, tokens: I)
    where
        I: IntoIterator<Item = V>,
        V: Into<String>,
    {
        let source_id = self.sources.len();
        self.sources.push(source);
        for token in tokens {
            self.indexed
                .entry(token.into())
                .or_default()
                .insert(source_id);
        }
    }

    /// Returns the sources that contain *all* of the given tokens
    /// (AND semantics). An empty query yields no results.
    pub fn search<'a, I, K>(&self, tokens: I) -> impl Iterator<Item = &T>
    where
        String: Borrow<K>,
        K: Ord + ?Sized + 'a,
        I: IntoIterator<Item = &'a K>,
    {
        let mut tokens = tokens.into_iter();

        tokens
            .next()
            .and_then(|token| self.indexed.get(token).cloned())
            .and_then(|first| {
                // Intersect the id sets of the remaining tokens, bailing
                // out early once the intersection becomes empty.
                tokens.try_fold(first, |found, token| {
                    self.indexed
                        .get(token)
                        .map(|sources| {
                            found
                                .intersection(sources)
                                .cloned()
                                .collect::<BTreeSet<_>>()
                        })
                        .filter(|update| !update.is_empty())
                })
            })
            .unwrap_or_default()
            .into_iter()
            .map(move |source| &self.sources[source])
    }

    /// All distinct tokens in the index, in sorted order.
    pub fn tokens(&self) -> impl Iterator<Item = &str> {
        self.indexed.keys().map(|it| it.as_str())
    }

    /// All indexed sources, in insertion order.
    pub fn sources(&self) -> &[T] {
        &self.sources
    }
}

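// A quick check of the index's AND semantics; a minimal illustrative sketch
// (the file names and tokens are made up), not part of the original program.
#[cfg(test)]
mod index_tests {
    use super::InvertedIndex;

    #[test]
    fn search_intersects_token_sets() {
        let mut index = InvertedIndex::default();
        index.add("alpha.txt", ["hello", "world"]);
        index.add("beta.txt", ["hello", "rust"]);

        // Only "alpha.txt" contains both tokens.
        let hits: Vec<_> = index.search(["hello", "world"]).collect();
        assert_eq!(hits, vec![&"alpha.txt"]);

        // A token that matches nothing empties the result set,
        // and so does an empty query.
        assert_eq!(index.search(["hello", "missing"]).count(), 0);
        assert_eq!(index.search(std::iter::empty::<&str>()).count(), 0);
    }
}
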
// Part 2: File walking and processing

use std::{
    ffi::OsString,
    fmt::Display,
    fs::{read_dir, DirEntry, File, ReadDir},
    io::{self, stdin, Read},
    path::{Path, PathBuf},
};

/// A depth-first iterator over all files under a root directory.
#[derive(Debug)]
pub struct Files {
    /// The stack of directories still being traversed.
    dirs: Vec<ReadDir>,
}

impl Files {
    /// Starts a recursive walk rooted at `path`.
    pub fn walk<P: AsRef<Path>>(path: P) -> io::Result<Self> {
        Ok(Files {
            dirs: vec![read_dir(path)?],
        })
    }
}

impl Iterator for Files {
    type Item = DirEntry;

    fn next(&mut self) -> Option<Self::Item> {
        // Depth-first traversal: the top of `dirs` is the directory
        // currently being read. Unreadable entries are silently skipped.
        'outer: while let Some(mut current) = self.dirs.pop() {
            while let Some(entry) = current.next() {
                if let Ok(entry) = entry {
                    let path = entry.path();
                    if !path.is_dir() {
                        self.dirs.push(current);
                        return Some(entry);
                    } else if let Ok(dir) = read_dir(path) {
                        self.dirs.push(current);
                        self.dirs.push(dir);
                        continue 'outer;
                    }
                }
            }
        }

        None // No directory left
    }
}

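// A minimal usage sketch for `Files`, not part of the original program. It
// assumes a readable "./src" directory and a caller that returns io::Result:
//
//     for entry in Files::walk("./src")? {
//         println!("{}", entry.path().display());
//     }
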
/// Splits `input` on non-alphanumeric characters and lowercases the pieces.
fn tokenize(input: &str) -> impl Iterator<Item = String> + '_ {
    input
        .split(|c: char| !c.is_alphanumeric())
        .filter(|token| !token.is_empty())
        .map(|token| token.to_lowercase())
}

/// Reads a file and returns the set of distinct tokens it contains.
/// Invalid UTF-8 is replaced rather than treated as an error.
fn tokenize_file<P: AsRef<Path>>(path: P) -> io::Result<BTreeSet<String>> {
    let mut buffer = Vec::new();
    File::open(path)?.read_to_end(&mut buffer)?;
    let text = String::from_utf8_lossy(&buffer);
    Ok(tokenize(&text).collect::<BTreeSet<_>>())
}

/// Tokenizes a query, deduplicates the tokens, and sorts them by length,
/// longest first. This heuristic should narrow the result set faster,
/// since longer tokens tend to be rarer.
fn tokenize_query(input: &str) -> Vec<String> {
    let result = tokenize(input).collect::<BTreeSet<_>>();
    let mut result = result.into_iter().collect::<Vec<_>>();
    result.sort_by_key(|item| std::cmp::Reverse(item.len()));
    result
}

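// A quick sanity check for the tokenizers; a minimal sketch with made-up
// inputs, not part of the original program.
#[cfg(test)]
mod tokenizer_tests {
    use super::{tokenize, tokenize_query};

    #[test]
    fn tokenize_splits_and_lowercases() {
        let tokens: Vec<_> = tokenize("Hello, World! Hello?").collect();
        assert_eq!(tokens, vec!["hello", "world", "hello"]);
    }

    #[test]
    fn query_tokens_are_deduplicated_and_sorted_by_length() {
        // "the" is shortest, so it sorts last; equal lengths keep their
        // alphabetical (BTreeSet) order because the sort is stable.
        assert_eq!(
            tokenize_query("the QUICK brown the"),
            vec!["brown", "quick", "the"]
        );
    }
}
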
// Part 3: Interactive application

/// Parses the command line: the first argument is the root path, any
/// remaining arguments are file extensions to index.
fn args() -> io::Result<(OsString, BTreeSet<OsString>)> {
    let mut args = std::env::args_os().skip(1); // Skip the executable's name

    let path = args
        .next()
        .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "missing path"))?;

    let extensions = args.collect::<BTreeSet<_>>();

    Ok((path, extensions))
}

/// Prints numbered hits, or "(none)" if the iterator is empty.
fn print_hits<T>(hits: impl Iterator<Item = T>)
where
    T: Display,
{
    let mut found_none = true;
    for (number, hit) in hits.enumerate() {
        println!(" [{}] {}", number + 1, hit);
        found_none = false;
    }

    if found_none {
        println!("(none)")
    }
}

fn main() -> io::Result<()> {
    let (path, extensions) = args()?;
    let mut files = InvertedIndex::<PathBuf>::default();
    let mut content = InvertedIndex::<PathBuf>::default();

    println!(
        "Indexing {:?} files in '{}'",
        extensions,
        path.to_string_lossy()
    );

    // Index every file under `path` whose extension matches; with no
    // extensions given, index everything that has an extension at all.
    for path in Files::walk(path)?.map(|file| file.path()).filter(|path| {
        path.extension()
            .filter(|&ext| extensions.is_empty() || extensions.contains(ext))
            .is_some()
    }) {
        // Index the file name itself, then its contents.
        files.add(path.clone(), tokenize(&path.to_string_lossy()));

        match tokenize_file(&path) {
            Ok(tokens) => content.add(path, tokens),
            Err(e) => eprintln!("Skipping file {}: {}", path.display(), e),
        }
    }

    println!(
        "Indexed {} tokens in {} files.",
        content.tokens().count(),
        content.sources().len()
    );

    // Run the query UI loop
    let mut query = String::new();

    loop {
        query.clear();
        println!("Enter search query:");
        // Stop on read errors, EOF, or an empty line; the empty case makes
        // a separate "" match arm below unnecessary.
        if stdin().read_line(&mut query).is_err() || query.trim().is_empty() {
            break;
        }

        match query.trim() {
            "/exit" | "/quit" => break,

            "/tokens" => {
                println!("Tokens:");
                for token in content.tokens() {
                    println!("{}", token);
                }

                println!();
            }

            "/files" => {
                println!("Sources:");
                for source in content.sources() {
                    println!("{}", source.display());
                }

                println!();
            }

            _ => {
                let query = tokenize_query(&query);
                println!();
                println!("Found hits:");
                print_hits(content.search(query.iter()).map(|it| it.display()));
                println!("Found file names:");
                print_hits(files.search(query.iter()).map(|it| it.display()));
                println!();
            }
        }
    }

    Ok(())
}
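
// A hypothetical invocation, for illustration only (the path, extensions,
// and counts below are made up): index all `rs` and `toml` files under
// `./project`, then query interactively.
//
//     $ cargo run -- ./project rs toml
//     Indexing {"rs", "toml"} files in './project'
//     Indexed <N> tokens in <M> files.
//     Enter search query:
//
// Plain tokens are searched with AND semantics; the commands /tokens,
// /files, /exit, and /quit are handled specially. An empty line exits.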