Skip to content
This repository has been archived by the owner on Aug 14, 2021. It is now read-only.

Commit

Permalink
Multifield FST is generated
Browse files Browse the repository at this point in the history
  • Loading branch information
emmanuel-keller committed Nov 14, 2017
1 parent 747588b commit 762302c
Show file tree
Hide file tree
Showing 9 changed files with 168 additions and 5 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
/target/
**/*.rs.bk
Cargo.lock
.idea/
/test/
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ version = "0.1.0"
authors = ["Emmanuel Keller <[email protected]>"]

[dependencies]
fst = "0.2"
16 changes: 16 additions & 0 deletions src/document.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
pub mod document {
    use terms::terms::Terms;
    use std::collections::HashMap;

    /// A document to index: a set of named fields, each carrying its own `Terms`.
    #[derive(Default)]
    pub struct Document {
        /// Terms of every field, keyed by field name.
        pub fields: HashMap<String, Terms>,
    }

    impl Document {
        /// Creates a document that has no fields yet.
        pub fn new() -> Document {
            Document { fields: HashMap::new() }
        }

        /// Returns the terms of the field called `field_name`.
        ///
        /// The field is created with an empty `Terms` the first time it is
        /// requested; later calls with the same name return the same entry, so
        /// calls can be chained with `Terms::term`.
        pub fn field(&mut self, field_name: &str) -> &mut Terms {
            self.fields
                .entry(field_name.to_string())
                // Build the empty `Terms` only when the field is actually missing.
                .or_insert_with(Terms::new)
        }
    }
}
5 changes: 5 additions & 0 deletions src/field.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pub mod field {
    /// A named field; segments keep a map of these keyed by field name.
    pub struct Field {
        /// Name of the field.
        name: String,
    }
}
64 changes: 64 additions & 0 deletions src/index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
pub mod index {
    use std::collections::HashMap;
    use segment::segment::Segment;
    use terms::terms::Terms;
    use document::document::Document;
    use std::io;
    use std::path::Path;
    use std::path::PathBuf;
    use std::fs;
    use std::fs::File;
    use fst::{IntoStreamer, Streamer, Map, MapBuilder, Result};

    /// An index stored in a directory on disk.
    pub struct Index {
        /// Index directory, exactly as it was passed to `Index::new`.
        pub path: String,
        /// Segments discovered when the index was opened, keyed by directory name.
        segments: HashMap<String, Segment>,
    }

    impl Index {
        /// Open an existing index, or create a new one.
        ///
        /// If `index_path` does not exist, that single directory is created.
        /// Every sub-directory found inside it is then registered as a segment.
        ///
        /// # Errors
        ///
        /// Returns the underlying I/O error when the directory cannot be
        /// created or listed, or when a segment fails to load. A sub-directory
        /// whose name is not valid UTF-8 yields an `InvalidData` error.
        pub fn new(index_path: &str) -> io::Result<Index> {
            let root = Path::new(index_path);
            if !root.exists() {
                // `create_dir` rather than `create_dir_all`: a missing parent
                // directory must be reported as an error (the crate's tests
                // rely on that).
                fs::create_dir(root)?;
            }

            // Read the existing segments
            let mut segments = HashMap::new();
            for entry in fs::read_dir(root)? {
                let entry = entry?;
                if !entry.file_type()?.is_dir() {
                    continue;
                }
                // Report an undecodable name as an error instead of panicking.
                let dir_name = entry.file_name().into_string().map_err(|raw| {
                    io::Error::new(
                        io::ErrorKind::InvalidData,
                        format!("segment directory name is not valid UTF-8: {:?}", raw),
                    )
                })?;
                let segment = Segment::new(&dir_name)?;
                segments.insert(dir_name, segment);
            }

            Ok(Index { path: index_path.to_string(), segments: segments })
        }

        /// Writes the given documents to the index.
        ///
        /// # Errors
        ///
        /// Stops at, and returns, the first error raised while writing a document.
        pub fn insert(&self, documents: Vec<Document>) -> Result<()> {
            //TODO get next segment number
            for document in documents {
                // Propagate the failure; ignoring it would report success
                // even though nothing was written.
                self.insert_document(document)?;
            }
            Ok(())
        }

        /// Writes every field of `document`, one FST file per field.
        fn insert_document(&self, document: Document) -> Result<()> {
            for (field, terms) in document.fields {
                self.insert_field(1, &field, terms)?;
            }
            Ok(())
        }

        /// Builds `<index path>/<field>.fst`, mapping each term of the field to
        /// its rank (0, 1, 2, ...) in sorted term order.
        ///
        /// The positions recorded for each term are not written.
        ///
        /// NOTE(review): the file name depends only on `field`, so writing the
        /// same field again replaces the previous file -- confirm this is the
        /// intended interim behaviour until segments are wired in.
        fn insert_field(&self, segment_number: u64, field: &str, terms: Terms) -> Result<()> {
            // Segment numbering is not implemented yet; the argument is accepted
            // so callers do not have to change once it is.
            let _ = segment_number;

            let field_path: PathBuf = Path::new(&self.path).join(format!("{}.fst", field));
            let writer = io::BufWriter::new(File::create(field_path)?);
            let mut builder = MapBuilder::new(writer)?;

            // `term_positions` is a BTreeMap, so keys come out in ascending
            // order, which is the insertion order the FST builder requires.
            for (ordinal, term) in (0u64..).zip(terms.term_positions.keys()) {
                builder.insert(term, ordinal)?;
            }

            builder.finish()?;
            Ok(())
        }
    }
}
18 changes: 13 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
extern crate fst;

use std::collections::HashMap;

pub mod field;
pub mod segment;
pub mod index;
pub mod terms;
pub mod document;

use index::index::Index;

// Placeholder test module: its only test has an empty body, so it asserts nothing.
#[cfg(test)]
mod tests {
#[test]
fn it_works() {
}
}
mod tests;
17 changes: 17 additions & 0 deletions src/segment.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
pub mod segment {
    use std::collections::HashMap;
    use std::io;
    use field::field::Field;

    /// A named segment together with the fields it holds.
    pub struct Segment {
        /// Name the segment was created with.
        name: String,
        /// Fields of the segment, keyed by field name.
        fields: HashMap<String, Field>,
    }

    impl Segment {
        /// Creates the segment called `name` with an empty field map.
        ///
        /// Nothing is read from disk yet, so this currently always returns `Ok`.
        pub fn new(name: &str) -> io::Result<Segment> {
            //TODO load fields
            let segment = Segment {
                name: String::from(name),
                fields: HashMap::new(),
            };
            Ok(segment)
        }
    }
}
18 changes: 18 additions & 0 deletions src/terms.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
pub mod terms {
    use std::collections::BTreeMap;

    /// The terms of one field, each with the positions recorded for it.
    #[derive(Debug, Default, Clone, PartialEq, Eq)]
    pub struct Terms {
        /// Positions per term, in the order they were added. The `BTreeMap`
        /// keeps the terms themselves sorted.
        pub term_positions: BTreeMap<String, Vec<i32>>,
    }

    impl Terms {
        /// Creates an empty set of terms.
        pub fn new() -> Terms {
            Terms { term_positions: BTreeMap::new() }
        }

        /// Records that `term` occurs at `position`.
        ///
        /// Calling this again for the same term appends to its position list.
        /// Returns `self` so that calls can be chained.
        pub fn term(&mut self, term: &str, position: i32) -> &mut Terms {
            self.term_positions
                .entry(term.to_string())
                // Allocate the position list only for a term not seen before.
                .or_insert_with(Vec::new)
                .push(position);
            self
        }
    }
}
32 changes: 32 additions & 0 deletions src/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use index::index::Index;
    use document::document::Document;

    /// Opens (or creates) the index under `target/test` and inserts two
    /// documents that share the `id` and `title` fields.
    #[test]
    fn create_index() {
        let index = Index::new("target/test").unwrap();
        assert_eq!(index.path, "target/test");

        let mut first = Document::new();
        first.field("id").term("id1", 0);
        first.field("title").term("my", 0).term("title", 1);

        let mut second = Document::new();
        second.field("id").term("id1", 1);
        second
            .field("title")
            .term("my", 0)
            .term("second", 1)
            .term("title", 2)
            .term("titles", 2);

        let documents = vec![first, second];
        assert!(index.insert(documents).is_ok());
    }

    /// Opening an index whose parent directory does not exist must fail.
    #[test]
    fn fail_on_create_index_sub_directory() {
        assert!(Index::new("target/test/test/test").is_err());
    }
}

0 comments on commit 762302c

Please sign in to comment.