RosettaCodeData/Task/Inverted-index/Rust/inverted-index.rust

// Part 1: Inverted index structure
use std::{
    borrow::Borrow,
    collections::{BTreeMap, BTreeSet},
};

/// Maps each token to the set of sources (as indices into `sources`)
/// in which it occurs.
#[derive(Debug, Default)]
pub struct InvertedIndex<T> {
    indexed: BTreeMap<String, BTreeSet<usize>>,
    sources: Vec<T>,
}

impl<T> InvertedIndex<T> {
    /// Registers `source` and indexes it under every token in `tokens`.
    pub fn add<I, V>(&mut self, source: T, tokens: I)
    where
        I: IntoIterator<Item = V>,
        V: Into<String>,
    {
        let source_id = self.sources.len();
        self.sources.push(source);
        for token in tokens {
            self.indexed
                .entry(token.into())
                .or_default()
                .insert(source_id);
        }
    }

    /// Returns the sources that contain all of the given tokens.
    pub fn search<'a, I, K>(&self, tokens: I) -> impl Iterator<Item = &T>
    where
        String: Borrow<K>,
        K: Ord + ?Sized + 'a,
        I: IntoIterator<Item = &'a K>,
    {
        let mut tokens = tokens.into_iter();
        tokens
            .next()
            .and_then(|token| self.indexed.get(token).cloned())
            .and_then(|first| {
                // Intersect the posting sets token by token, bailing out as
                // soon as the intersection becomes empty.
                tokens.try_fold(first, |found, token| {
                    self.indexed
                        .get(token)
                        .map(|sources| {
                            found
                                .intersection(sources)
                                .cloned()
                                .collect::<BTreeSet<_>>()
                        })
                        .filter(|update| !update.is_empty())
                })
            })
            .unwrap_or_default()
            .into_iter()
            .map(move |source| &self.sources[source])
    }

    /// All indexed tokens, in sorted order.
    pub fn tokens(&self) -> impl Iterator<Item = &str> {
        self.indexed.keys().map(|it| it.as_str())
    }

    /// All registered sources, in insertion order.
    pub fn sources(&self) -> &[T] {
        &self.sources
    }
}
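
// A minimal usage sketch (not part of the original solution; the document
// names are made up for illustration): `search` yields only the sources
// that contain every query token.
#[cfg(test)]
mod index_tests {
    use super::InvertedIndex;

    #[test]
    fn search_intersects_posting_sets() {
        let mut index = InvertedIndex::default();
        index.add("doc1", ["it", "is", "what", "it", "is"]);
        index.add("doc2", ["what", "is", "it"]);
        index.add("doc3", ["it", "is", "banana"]);

        // Only doc1 and doc2 contain both "what" and "is".
        let hits: Vec<_> = index.search(["what", "is"].iter().copied()).collect();
        assert_eq!(hits, [&"doc1", &"doc2"]);

        // A token that appears nowhere yields no hits at all.
        assert_eq!(index.search(["missing"].iter().copied()).count(), 0);
    }
}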
// Part 2: File walking and processing
use std::{
    ffi::OsString,
    fmt::Display,
    fs::{read_dir, DirEntry, File, ReadDir},
    io::{self, stdin, Read},
    path::{Path, PathBuf},
};

/// A depth-first iterator over the regular files below a directory.
#[derive(Debug)]
pub struct Files {
    dirs: Vec<ReadDir>,
}

impl Files {
    pub fn walk<P: AsRef<Path>>(path: P) -> io::Result<Self> {
        Ok(Files {
            dirs: vec![read_dir(path)?],
        })
    }
}

impl Iterator for Files {
    type Item = DirEntry;

    fn next(&mut self) -> Option<Self::Item> {
        'outer: while let Some(mut current) = self.dirs.pop() {
            while let Some(entry) = current.next() {
                if let Ok(entry) = entry {
                    let path = entry.path();
                    if !path.is_dir() {
                        self.dirs.push(current);
                        return Some(entry);
                    } else if let Ok(dir) = read_dir(path) {
                        // Descend into the subdirectory, resuming the
                        // current directory afterwards.
                        self.dirs.push(current);
                        self.dirs.push(dir);
                        continue 'outer;
                    }
                }
            }
        }
        None // No directories left
    }
}
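
// A small self-check for the walker (a sketch; it assumes a writable system
// temp directory and no leftover state from earlier runs): build a tiny tree
// and verify that `Files` yields exactly the regular files inside it.
#[cfg(test)]
mod walk_tests {
    use super::Files;
    use std::fs;

    #[test]
    fn walk_visits_nested_files() -> std::io::Result<()> {
        let root = std::env::temp_dir().join("inverted_index_walk_test");
        fs::create_dir_all(root.join("sub"))?;
        fs::write(root.join("a.txt"), "alpha")?;
        fs::write(root.join("sub").join("b.txt"), "beta")?;

        let mut names: Vec<_> = Files::walk(&root)?
            .map(|entry| entry.file_name().to_string_lossy().into_owned())
            .collect();
        names.sort();
        assert_eq!(names, ["a.txt", "b.txt"]);

        fs::remove_dir_all(&root)?;
        Ok(())
    }
}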
/// Splits `input` at non-alphanumeric characters and lowercases each token.
fn tokenize(input: &str) -> impl Iterator<Item = String> + '_ {
    input
        .split(|c: char| !c.is_alphanumeric())
        .filter(|token| !token.is_empty())
        .map(|token| token.to_lowercase())
}

fn tokenize_file<P: AsRef<Path>>(path: P) -> io::Result<BTreeSet<String>> {
    let mut buffer = Vec::new();
    File::open(path)?.read_to_end(&mut buffer)?;
    // Tolerate non-UTF-8 content instead of failing on it.
    let text = String::from_utf8_lossy(&buffer);
    Ok(tokenize(&text).collect::<BTreeSet<_>>())
}

fn tokenize_query(input: &str) -> Vec<String> {
    let result = tokenize(input).collect::<BTreeSet<_>>();
    // Sort the deduplicated tokens by descending length, so that longer
    // tokens are processed first. This heuristic should narrow the result
    // set faster.
    let mut result = result.into_iter().collect::<Vec<_>>();
    result.sort_by_key(|item| std::cmp::Reverse(item.len()));
    result
}
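
// A brief check of the tokenizers (the inputs are toy strings chosen for
// illustration): punctuation splits tokens, case is folded, and query tokens
// are deduplicated and ordered longest-first.
#[cfg(test)]
mod token_tests {
    use super::{tokenize, tokenize_query};

    #[test]
    fn tokenize_lowercases_and_splits() {
        let tokens: Vec<_> = tokenize("Hello, World! hello?").collect();
        assert_eq!(tokens, ["hello", "world", "hello"]);
    }

    #[test]
    fn query_tokens_are_unique_and_longest_first() {
        assert_eq!(tokenize_query("a banana, a pear"), ["banana", "pear", "a"]);
    }
}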
// Part 3: Interactive application
fn args() -> io::Result<(OsString, BTreeSet<OsString>)> {
    let mut args = std::env::args_os().skip(1); // Skip the executable's name
    let path = args
        .next()
        .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "missing path"))?;
    let extensions = args.collect::<BTreeSet<_>>();
    Ok((path, extensions))
}

fn print_hits<T>(hits: impl Iterator<Item = T>)
where
    T: Display,
{
    let mut found_none = true;
    for (number, hit) in hits.enumerate() {
        println!(" [{}] {}", number + 1, hit);
        found_none = false;
    }
    if found_none {
        println!("(none)");
    }
}
fn main() -> io::Result<()> {
    let (path, extensions) = args()?;
    let mut files = InvertedIndex::<PathBuf>::default();
    let mut content = InvertedIndex::<PathBuf>::default();
    println!(
        "Indexing {:?} files in '{}'",
        extensions,
        path.to_string_lossy()
    );
    // Note: only files that have an extension are considered; if no
    // extensions were given on the command line, any extension is accepted.
    for path in Files::walk(path)?.map(|file| file.path()).filter(|path| {
        path.extension()
            .filter(|&ext| extensions.is_empty() || extensions.contains(ext))
            .is_some()
    }) {
        files.add(path.clone(), tokenize(&path.to_string_lossy()));
        match tokenize_file(&path) {
            Ok(tokens) => content.add(path, tokens),
            Err(e) => eprintln!("Skipping file {}: {}", path.display(), e),
        }
    }
    println!(
        "Indexed {} tokens in {} files.",
        content.tokens().count(),
        content.sources().len()
    );
    // Run the query UI loop.
    let mut query = String::new();
    loop {
        query.clear();
        println!("Enter search query:");
        if stdin().read_line(&mut query).is_err() || query.trim().is_empty() {
            break;
        }
        match query.trim() {
            "/exit" | "/quit" | "" => break,
            "/tokens" => {
                println!("Tokens:");
                for token in content.tokens() {
                    println!("{}", token);
                }
                println!();
            }
            "/files" => {
                println!("Sources:");
                for source in content.sources() {
                    println!("{}", source.display());
                }
                println!();
            }
            _ => {
                let query = tokenize_query(&query);
                println!();
                println!("Found hits:");
                print_hits(content.search(query.iter()).map(|it| it.display()));
                println!("Found file names:");
                print_hits(files.search(query.iter()).map(|it| it.display()));
                println!();
            }
        }
    }
    Ok(())
}
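
// Example invocation (the directory and extension list are illustrative):
//
//     $ cargo run --release -- ./documents txt md
//     Indexing {"md", "txt"} files in './documents'
//
// After indexing, type tokens to search for, or use the built-in commands
// `/tokens` and `/files`; `/exit`, `/quit`, or an empty line quits.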