Makes bulk loader multithreaded

pull/173/head
Tpt 3 years ago
parent f262df9f53
commit e99d6f4ad0
  1. 45
      lib/src/storage/mod.rs
  2. 9
      lib/src/store.rs

@ -18,6 +18,7 @@ use std::mem::take;
#[cfg(not(target_arch = "wasm32"))] #[cfg(not(target_arch = "wasm32"))]
use std::path::Path; use std::path::Path;
use std::path::PathBuf; use std::path::PathBuf;
use std::thread::spawn;
mod backend; mod backend;
mod binary_encoder; mod binary_encoder;
@ -37,7 +38,7 @@ const DPOS_CF: &str = "dpos";
const DOSP_CF: &str = "dosp"; const DOSP_CF: &str = "dosp";
const GRAPHS_CF: &str = "graphs"; const GRAPHS_CF: &str = "graphs";
const DEFAULT_CF: &str = "default"; const DEFAULT_CF: &str = "default";
const AUTO_WRITE_BATCH_THRESHOLD: usize = 1024 * 1024; const BULK_LOAD_BATCH_SIZE: usize = 1024 * 1024;
/// Low level storage primitives /// Low level storage primitives
#[derive(Clone)] #[derive(Clone)]
@ -154,7 +155,7 @@ impl Storage {
if !quad.graph_name.is_default_graph() { if !quad.graph_name.is_default_graph() {
transaction.insert_empty(&this.graphs_cf, &encode_term(&quad.graph_name))?; transaction.insert_empty(&this.graphs_cf, &encode_term(&quad.graph_name))?;
size += 1; size += 1;
if size % AUTO_WRITE_BATCH_THRESHOLD == 0 { if size % BULK_LOAD_BATCH_SIZE == 0 {
let mut tr = this.db.transaction(); let mut tr = this.db.transaction();
swap(&mut transaction, &mut tr); swap(&mut transaction, &mut tr);
tr.commit()?; tr.commit()?;
@ -955,8 +956,29 @@ impl StorageWriter {
/// Creates a database from a dataset files. /// Creates a database from a dataset files.
#[cfg(not(target_arch = "wasm32"))] #[cfg(not(target_arch = "wasm32"))]
pub struct BulkLoader<'a> { pub fn bulk_load(storage: &Storage, quads: impl IntoIterator<Item = Result<Quad>>) -> Result<()> {
storage: &'a Storage, let mut threads = Vec::new();
let mut buffer = Vec::with_capacity(BULK_LOAD_BATCH_SIZE);
for quad in quads {
let quad = quad?;
buffer.push(quad);
if buffer.len() >= BULK_LOAD_BATCH_SIZE {
let buffer = take(&mut buffer);
let storage = storage.clone();
threads.push(spawn(move || BulkLoader::new(storage).load(buffer)));
}
}
BulkLoader::new(storage.clone()).load(buffer)?; // Last buffer
for thread in threads {
thread.join().unwrap()?;
}
Ok(())
}
/// Creates a database from a dataset files.
#[cfg(not(target_arch = "wasm32"))]
struct BulkLoader {
storage: Storage,
id2str: HashMap<StrHash, Box<str>>, id2str: HashMap<StrHash, Box<str>>,
quads: HashSet<EncodedQuad>, quads: HashSet<EncodedQuad>,
triples: HashSet<EncodedQuad>, triples: HashSet<EncodedQuad>,
@ -965,8 +987,8 @@ pub struct BulkLoader<'a> {
} }
#[cfg(not(target_arch = "wasm32"))] #[cfg(not(target_arch = "wasm32"))]
impl<'a> BulkLoader<'a> { impl BulkLoader {
pub fn new(storage: &'a Storage) -> Self { fn new(storage: Storage) -> Self {
Self { Self {
storage, storage,
id2str: HashMap::default(), id2str: HashMap::default(),
@ -977,10 +999,8 @@ impl<'a> BulkLoader<'a> {
} }
} }
pub fn load(&mut self, quads: impl IntoIterator<Item = Result<Quad>>) -> Result<()> { fn load(&mut self, quads: impl IntoIterator<Item = Quad>) -> Result<()> {
let mut count = 0;
for quad in quads { for quad in quads {
let quad = quad?;
let encoded = EncodedQuad::from(quad.as_ref()); let encoded = EncodedQuad::from(quad.as_ref());
self.buffer.clear(); self.buffer.clear();
if quad.graph_name.is_default_graph() { if quad.graph_name.is_default_graph() {
@ -1011,13 +1031,8 @@ impl<'a> BulkLoader<'a> {
} }
} }
} }
count += 1;
if count % (1024 * 1024) == 0 {
self.save()?;
}
} }
self.save()?; self.save()
self.storage.compact()
} }
fn save(&mut self) -> Result<()> { fn save(&mut self) -> Result<()> {

@ -30,10 +30,10 @@ use crate::sparql::{
evaluate_query, evaluate_update, EvaluationError, Query, QueryOptions, QueryResults, Update, evaluate_query, evaluate_update, EvaluationError, Query, QueryOptions, QueryResults, Update,
UpdateOptions, UpdateOptions,
}; };
#[cfg(not(target_arch = "wasm32"))]
use crate::storage::bulk_load;
use crate::storage::io::{dump_dataset, dump_graph, load_dataset, load_graph}; use crate::storage::io::{dump_dataset, dump_graph, load_dataset, load_graph};
use crate::storage::numeric_encoder::{Decoder, EncodedQuad, EncodedTerm}; use crate::storage::numeric_encoder::{Decoder, EncodedQuad, EncodedTerm};
#[cfg(not(target_arch = "wasm32"))]
use crate::storage::BulkLoader;
use crate::storage::{ChainedDecodingQuadIterator, DecodingGraphIterator, Storage, StorageReader}; use crate::storage::{ChainedDecodingQuadIterator, DecodingGraphIterator, Storage, StorageReader};
use std::io::{BufRead, Write}; use std::io::{BufRead, Write};
#[cfg(not(target_arch = "wasm32"))] #[cfg(not(target_arch = "wasm32"))]
@ -618,7 +618,7 @@ impl Store {
.with_base_iri(base_iri) .with_base_iri(base_iri)
.map_err(invalid_input_error)?; .map_err(invalid_input_error)?;
} }
BulkLoader::new(&self.storage).load(parser.read_quads(reader)?) bulk_load(&self.storage, parser.read_quads(reader)?)
} }
/// Loads a dataset file efficiently into the store. /// Loads a dataset file efficiently into the store.
@ -662,7 +662,8 @@ impl Store {
.map_err(invalid_input_error)?; .map_err(invalid_input_error)?;
} }
let to_graph_name = to_graph_name.into(); let to_graph_name = to_graph_name.into();
BulkLoader::new(&self.storage).load( bulk_load(
&self.storage,
parser parser
.read_triples(reader)? .read_triples(reader)?
.map(|r| Ok(r?.in_graph(to_graph_name.into_owned()))), .map(|r| Ok(r?.in_graph(to_graph_name.into_owned()))),

Loading…
Cancel
Save