Bulk loader: avoid writing duplicate values

encoder
Tpt 3 years ago
parent 52ec334ab3
commit cc4e3f8b0f
  1. 14
      lib/src/storage/mod.rs
  2. 7
      lib/tests/store.rs

@ -1500,9 +1500,19 @@ impl FileBulkLoader {
) -> Result<PathBuf, StorageError> { ) -> Result<PathBuf, StorageError> {
let mut values = values.collect::<Vec<_>>(); let mut values = values.collect::<Vec<_>>();
values.sort_unstable(); values.sort_unstable();
let deduplicated_values = values.iter().enumerate().filter_map(|(i, value)| {
if values
.get(i + 1)
.map_or(true, |next_value| value != next_value)
{
Some(value)
} else {
None
}
});
let mut sst = self.storage.db.new_sst_file()?; let mut sst = self.storage.db.new_sst_file()?;
for t in values { for value in deduplicated_values {
sst.insert_empty(&t)?; sst.insert_empty(value)?;
} }
sst.finish() sst.finish()
} }

@ -165,11 +165,12 @@ fn test_bulk_load_dataset() -> Result<(), Box<dyn Error>> {
store store
.bulk_loader() .bulk_loader()
.load_dataset(Cursor::new(GRAPH_DATA), DatasetFormat::TriG, None)?; .load_dataset(Cursor::new(GRAPH_DATA), DatasetFormat::TriG, None)?;
for q in quads(NamedNodeRef::new_unchecked( let graph_name =
"http://www.wikidata.org/wiki/Special:EntityData/Q90", NamedNodeRef::new_unchecked("http://www.wikidata.org/wiki/Special:EntityData/Q90");
)) { for q in quads(graph_name) {
assert!(store.contains(q)?); assert!(store.contains(q)?);
} }
assert!(store.contains_named_graph(graph_name)?);
store.validate()?; store.validate()?;
Ok(()) Ok(())
} }

Loading…
Cancel
Save