Wikibase: Allows downloading structured data on Commons

Closes #24
pull/35/head
Tpt 5 years ago
parent cc76ae8298
commit fe3b32063b
  1. 56
      wikibase/src/loader.rs
  2. 24
      wikibase/src/main.rs

@ -1,5 +1,5 @@
use crate::SERVER; use crate::SERVER;
use chrono::{DateTime, Utc}; use chrono::{DateTime, Datelike, Utc};
use oxigraph::model::NamedNode; use oxigraph::model::NamedNode;
use oxigraph::*; use oxigraph::*;
use reqwest::header::USER_AGENT; use reqwest::header::USER_AGENT;
@ -16,6 +16,7 @@ pub struct WikibaseLoader<R: Repository + Copy> {
entity_data_url: Url, entity_data_url: Url,
client: Client, client: Client,
namespaces: Vec<u32>, namespaces: Vec<u32>,
slot: Option<String>,
frequency: Duration, frequency: Duration,
start: DateTime<Utc>, start: DateTime<Utc>,
} }
@ -26,6 +27,7 @@ impl<R: Repository + Copy> WikibaseLoader<R> {
api_url: &str, api_url: &str,
pages_base_url: &str, pages_base_url: &str,
namespaces: &[u32], namespaces: &[u32],
slot: Option<&str>,
frequency: Duration, frequency: Duration,
) -> Result<Self> { ) -> Result<Self> {
Ok(Self { Ok(Self {
@ -35,16 +37,23 @@ impl<R: Repository + Copy> WikibaseLoader<R> {
.map_err(Error::wrap)?, .map_err(Error::wrap)?,
client: Client::new(), client: Client::new(),
namespaces: namespaces.to_vec(), namespaces: namespaces.to_vec(),
slot: slot.map(|t| t.to_owned()),
start: Utc::now(), start: Utc::now(),
frequency, frequency,
}) })
} }
pub fn initial_loading(&mut self) -> Result<()> { pub fn initial_loading(&mut self) -> Result<()> {
println!("Initial loading ");
self.start = Utc::now(); self.start = Utc::now();
if self.slot.is_some() {
println!("Skipping initial loading because a slot is required");
// No good initial loading
self.start = self.start.with_year(2018).unwrap();
return Ok(());
}
println!("Initial loading ");
for namespace in &self.namespaces { for namespace in &self.namespaces {
let mut parameters = HashMap::default(); let mut parameters = HashMap::default();
parameters.insert("action".to_owned(), "query".to_owned()); parameters.insert("action".to_owned(), "query".to_owned());
@ -104,14 +113,18 @@ impl<R: Repository + Copy> WikibaseLoader<R> {
let mut parameters = HashMap::default(); let mut parameters = HashMap::default();
parameters.insert("action".to_owned(), "query".to_owned()); parameters.insert("action".to_owned(), "query".to_owned());
parameters.insert("list".to_owned(), "recentchanges".to_owned()); parameters.insert("list".to_owned(), "recentchanges".to_owned());
parameters.insert( if let Some(slot) = &self.slot {
"rcnamespace".to_owned(), parameters.insert("rcslot".to_owned(), slot.to_owned());
self.namespaces } else {
.iter() parameters.insert(
.map(|ns| ns.to_string()) "rcnamespace".to_owned(),
.collect::<Vec<_>>() self.namespaces
.join("|"), .iter()
); .map(|ns| ns.to_string())
.collect::<Vec<_>>()
.join("|"),
);
}
parameters.insert("rcend".to_owned(), start.to_rfc2822()); parameters.insert("rcend".to_owned(), start.to_rfc2822());
parameters.insert("rcprop".to_owned(), "title|ids".to_owned()); parameters.insert("rcprop".to_owned(), "title|ids".to_owned());
parameters.insert("limit".to_owned(), "50".to_owned()); parameters.insert("limit".to_owned(), "50".to_owned());
@ -128,20 +141,21 @@ impl<R: Repository + Copy> WikibaseLoader<R> {
.unwrap() .unwrap()
{ {
let desc = change.as_object().unwrap(); let desc = change.as_object().unwrap();
let title = desc.get("title").unwrap().as_str().unwrap(); let id = if desc.get("ns").unwrap().as_u64().unwrap() == 6 {
// Hack for media info
let id = title.split(':').last().unwrap_or(title); format!("M{}", desc.get("pageid").unwrap().as_u64().unwrap())
if seen.contains(id) { } else {
let title = desc.get("title").unwrap().as_str().unwrap();
title.split(':').last().unwrap_or(title).to_owned()
};
if seen.contains(&id) {
continue; continue;
} }
seen.insert(id.to_owned()); seen.insert(id.clone());
match self.get_entity_data(id) { match self.get_entity_data(&id) {
Ok(data) => { Ok(data) => {
self.load_entity_data( self.load_entity_data(&format!("{}/{}", self.entity_data_url, id), data)?;
&(self.entity_data_url.to_string() + "/" + id),
data,
)?;
} }
Err(e) => eprintln!("Error while retrieving data for entity {}: {}", id, e), Err(e) => eprintln!("Error while retrieving data for entity {}: {}", id, e),
} }

@ -65,8 +65,15 @@ pub fn main() {
.arg( .arg(
Arg::with_name("namespaces") Arg::with_name("namespaces")
.long("namespaces") .long("namespaces")
.help("Namespaces ids, to load in Blazegraph like \"0,120\"") .help("Namespaces ids to load like \"0,120\"")
.required(true) .required(false)
.takes_value(true),
)
.arg(
Arg::with_name("slot")
.long("slot")
.help("Slot to load like \"mediainfo\". Could not be use with namespaces")
.required(false)
.takes_value(true), .takes_value(true),
) )
.get_matches(); .get_matches();
@ -91,16 +98,25 @@ where
let mediawiki_base_url = matches.value_of("mediawiki_base_url").unwrap().to_owned(); let mediawiki_base_url = matches.value_of("mediawiki_base_url").unwrap().to_owned();
let namespaces = matches let namespaces = matches
.value_of("namespaces") .value_of("namespaces")
.unwrap() .unwrap_or("")
.split(',') .split(',')
.map(|t| u32::from_str(t.trim()).unwrap()) .flat_map(|t| {
let t = t.trim();
if t.is_empty() {
None
} else {
Some(u32::from_str(t).unwrap())
}
})
.collect::<Vec<_>>(); .collect::<Vec<_>>();
let slot = matches.value_of("slot").map(|t| t.to_owned());
thread::spawn(move || { thread::spawn(move || {
let mut loader = WikibaseLoader::new( let mut loader = WikibaseLoader::new(
repo.as_ref(), repo.as_ref(),
&mediawiki_api, &mediawiki_api,
&mediawiki_base_url, &mediawiki_base_url,
&namespaces, &namespaces,
slot.as_deref(),
Duration::new(10, 0), Duration::new(10, 0),
) )
.unwrap(); .unwrap();

Loading…
Cancel
Save