From cc4e3f8b0fcbdc754589977ce4846e4b0aaac4a3 Mon Sep 17 00:00:00 2001 From: Tpt Date: Sun, 3 Apr 2022 16:50:15 +0200 Subject: [PATCH] Bulk loader: avoid writing duplicate values --- lib/src/storage/mod.rs | 14 ++++++++++++-- lib/tests/store.rs | 7 ++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/lib/src/storage/mod.rs b/lib/src/storage/mod.rs index e6195197..fc13400e 100644 --- a/lib/src/storage/mod.rs +++ b/lib/src/storage/mod.rs @@ -1500,9 +1500,19 @@ impl FileBulkLoader { ) -> Result { let mut values = values.collect::>(); values.sort_unstable(); + let deduplicated_values = values.iter().enumerate().filter_map(|(i, value)| { + if values + .get(i + 1) + .map_or(true, |next_value| value != next_value) + { + Some(value) + } else { + None + } + }); let mut sst = self.storage.db.new_sst_file()?; - for t in values { - sst.insert_empty(&t)?; + for value in deduplicated_values { + sst.insert_empty(value)?; } sst.finish() } diff --git a/lib/tests/store.rs b/lib/tests/store.rs index 693a22c2..d1ee4e39 100644 --- a/lib/tests/store.rs +++ b/lib/tests/store.rs @@ -165,11 +165,12 @@ fn test_bulk_load_dataset() -> Result<(), Box> { store .bulk_loader() .load_dataset(Cursor::new(GRAPH_DATA), DatasetFormat::TriG, None)?; - for q in quads(NamedNodeRef::new_unchecked( - "http://www.wikidata.org/wiki/Special:EntityData/Q90", - )) { + let graph_name = + NamedNodeRef::new_unchecked("http://www.wikidata.org/wiki/Special:EntityData/Q90"); + for q in quads(graph_name) { assert!(store.contains(q)?); } + assert!(store.contains_named_graph(graph_name)?); store.validate()?; Ok(()) }