Bulk loader: avoid writing duplicate values

encoder
Tpt 3 years ago
parent 52ec334ab3
commit cc4e3f8b0f
  1. 14
      lib/src/storage/mod.rs
  2. 7
      lib/tests/store.rs

@ -1500,9 +1500,19 @@ impl FileBulkLoader {
) -> Result<PathBuf, StorageError> {
let mut values = values.collect::<Vec<_>>();
values.sort_unstable();
let deduplicated_values = values.iter().enumerate().filter_map(|(i, value)| {
if values
.get(i + 1)
.map_or(true, |next_value| value != next_value)
{
Some(value)
} else {
None
}
});
let mut sst = self.storage.db.new_sst_file()?;
for t in values {
sst.insert_empty(&t)?;
for value in deduplicated_values {
sst.insert_empty(value)?;
}
sst.finish()
}

@ -165,11 +165,12 @@ fn test_bulk_load_dataset() -> Result<(), Box<dyn Error>> {
store
.bulk_loader()
.load_dataset(Cursor::new(GRAPH_DATA), DatasetFormat::TriG, None)?;
for q in quads(NamedNodeRef::new_unchecked(
"http://www.wikidata.org/wiki/Special:EntityData/Q90",
)) {
let graph_name =
NamedNodeRef::new_unchecked("http://www.wikidata.org/wiki/Special:EntityData/Q90");
for q in quads(graph_name) {
assert!(store.contains(q)?);
}
assert!(store.contains_named_graph(graph_name)?);
store.validate()?;
Ok(())
}

Loading…
Cancel
Save