diff --git a/Cargo.lock b/Cargo.lock index 9aa8ceb0f2..672036fdfe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6392,6 +6392,7 @@ dependencies = [ "bytemuck", "bzip2 0.4.4", "chrono", + "crossbeam-channel", "csv", "dashmap", "datafusion-expr-common", @@ -6638,6 +6639,7 @@ dependencies = [ "raphtory", "raphtory-auth-noop", "raphtory-graphql", + "tikv-jemallocator", ] [[package]] diff --git a/db4-storage/src/pages/mod.rs b/db4-storage/src/pages/mod.rs index dcc1c07b6f..682865ab51 100644 --- a/db4-storage/src/pages/mod.rs +++ b/db4-storage/src/pages/mod.rs @@ -176,7 +176,7 @@ impl< node_meta.get_or_create_node_type_id(node_type); } - let t_len = edge_storage.t_len(); + let t_len = edge_storage.t_len() + node_storage.t_len(); Ok(Self { nodes: node_storage, diff --git a/db4-storage/src/pages/node_store.rs b/db4-storage/src/pages/node_store.rs index b28420d1b3..89413ef277 100644 --- a/db4-storage/src/pages/node_store.rs +++ b/db4-storage/src/pages/node_store.rs @@ -170,6 +170,10 @@ impl, EXT: PersistenceStrategy> self.segments.count() } + pub fn t_len(&self) -> usize { + self.segments.iter().map(|(_, page)| page.t_len()).sum() + } + // pub fn segments(&self) -> &boxcar::Vec> { // &self.segments // } diff --git a/python/Cargo.toml b/python/Cargo.toml index 2744f7afdc..3fcef1949a 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -39,5 +39,5 @@ proto = ["raphtory/proto"] [build-dependencies] pyo3-build-config = { workspace = true } -#[target.'cfg(not(target_env = "msvc"))'.dependencies] -#tikv-jemallocator.workspace = true +[target.'cfg(target_os = "macos")'.dependencies] +tikv-jemallocator.workspace = true diff --git a/python/src/lib.rs b/python/src/lib.rs index b1ca6c95d9..564f3822a9 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -10,6 +10,12 @@ use raphtory::python::{ }; use raphtory_graphql::python::pymodule::base_graphql_module; +#[cfg(target_os = "macos")] +use tikv_jemallocator::Jemalloc; +#[cfg(target_os = "macos")] +#[global_allocator] +static 
GLOBAL: Jemalloc = Jemalloc; + /// Raphtory graph analytics library #[pymodule] fn _raphtory(py: Python<'_>, m: &Bound) -> PyResult<()> { diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index 3f54ff24ed..d6cc250754 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -23,6 +23,7 @@ storage.workspace = true iter-enum = { workspace = true, features = ["rayon"] } hashbrown = { workspace = true } chrono = { workspace = true } +crossbeam-channel = { workspace = true } itertools = { workspace = true } num-traits = { workspace = true } num-integer = { workspace = true } diff --git a/raphtory/examples/eth_loader.rs b/raphtory/examples/eth_loader.rs index 9e2fdd2afd..e0fbb2f0ed 100644 --- a/raphtory/examples/eth_loader.rs +++ b/raphtory/examples/eth_loader.rs @@ -1,8 +1,6 @@ #[cfg(feature = "io")] -use raphtory::io::{ - arrow::df_loaders::edges::ColumnNames, parquet_loaders::load_edges_from_parquet, -}; -use raphtory::{errors::GraphError, prelude::*}; +use raphtory::io::parquet_loaders::load_edges_from_parquet; +use raphtory::{arrow_loader::df_loaders::edges::ColumnNames, errors::GraphError, prelude::*}; use std::path::{Path, PathBuf}; /// Load ETH data from Parquet files into a Raphtory Graph. diff --git a/raphtory/examples/snb_loader.rs b/raphtory/examples/snb_loader.rs index b8bfb73fa8..26584b1860 100644 --- a/raphtory/examples/snb_loader.rs +++ b/raphtory/examples/snb_loader.rs @@ -1,9 +1,6 @@ #[cfg(feature = "io")] -use raphtory::io::{ - arrow::df_loaders::edges::ColumnNames, - parquet_loaders::{load_edges_from_parquet, load_nodes_from_parquet}, -}; -use raphtory::{errors::GraphError, prelude::*}; +use raphtory::io::parquet_loaders::{load_edges_from_parquet, load_nodes_from_parquet}; +use raphtory::{arrow_loader::df_loaders::edges::ColumnNames, errors::GraphError, prelude::*}; use std::path::{Path, PathBuf}; /// Construct the path to a named Parquet file inside `parquet_dir`. 
diff --git a/raphtory/src/io/arrow/dataframe.rs b/raphtory/src/arrow_loader/dataframe.rs similarity index 99% rename from raphtory/src/io/arrow/dataframe.rs rename to raphtory/src/arrow_loader/dataframe.rs index 74d75a5e3d..1a258a18ee 100644 --- a/raphtory/src/io/arrow/dataframe.rs +++ b/raphtory/src/arrow_loader/dataframe.rs @@ -1,7 +1,7 @@ use crate::{ api::core::utils::time::TryIntoTime, + arrow_loader::node_col::{lift_node_col, NodeCol}, errors::{into_load_err, GraphError, LoadError}, - io::arrow::node_col::{lift_node_col, NodeCol}, }; use arrow::{ array::{cast::AsArray, Array, ArrayRef, PrimitiveArray}, diff --git a/raphtory/src/io/arrow/df_loaders/edge_props.rs b/raphtory/src/arrow_loader/df_loaders/edge_props.rs similarity index 98% rename from raphtory/src/io/arrow/df_loaders/edge_props.rs rename to raphtory/src/arrow_loader/df_loaders/edge_props.rs index 6f391238a0..f634d909f3 100644 --- a/raphtory/src/io/arrow/df_loaders/edge_props.rs +++ b/raphtory/src/arrow_loader/df_loaders/edge_props.rs @@ -1,10 +1,10 @@ #[cfg(feature = "progress")] -use crate::io::arrow::df_loaders::build_progress_bar; +use crate::arrow_loader::df_loaders::build_progress_bar; +#[cfg(feature = "progress")] +use kdam::BarExt; use crate::{ - db::api::view::StaticGraphViewOps, - errors::{into_graph_err, GraphError, LoadError}, - io::arrow::{ + arrow_loader::{ dataframe::{DFChunk, DFView}, df_loaders::{ edges::{get_or_resolve_node_vids, store_node_ids, ColumnNames}, @@ -13,13 +13,14 @@ use crate::{ layer_col::lift_layer_col, prop_handler::*, }, + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError, LoadError}, prelude::*, }; use arrow::{array::AsArray, datatypes::UInt64Type}; use bytemuck::checked::cast_slice_mut; use db4_graph::WriteLockedGraph; use itertools::izip; -use kdam::BarExt; use raphtory_api::{ atomic_extra::atomic_usize_from_mut_slice, core::entities::{properties::prop::AsPropRef, LayerId, EID}, diff --git a/raphtory/src/io/arrow/df_loaders/edges.rs 
b/raphtory/src/arrow_loader/df_loaders/edges.rs similarity index 76% rename from raphtory/src/io/arrow/df_loaders/edges.rs rename to raphtory/src/arrow_loader/df_loaders/edges.rs index 8edec1fb6f..40e28ff282 100644 --- a/raphtory/src/io/arrow/df_loaders/edges.rs +++ b/raphtory/src/arrow_loader/df_loaders/edges.rs @@ -1,28 +1,27 @@ #[cfg(feature = "progress")] -use crate::io::arrow::df_loaders::build_progress_bar; +use crate::arrow_loader::df_loaders::build_progress_bar; +#[cfg(feature = "progress")] +use kdam::BarExt; use crate::{ - db::api::{storage::storage::PersistenceStrategy, view::StaticGraphViewOps}, - errors::{into_graph_err, GraphError, LoadError}, - io::{ - arrow::{ - dataframe::{DFChunk, DFView}, - df_loaders::{ - extract_secondary_index_col, process_shared_properties, resolve_nodes_with_cache, - }, - layer_col::lift_layer_col, - node_col::NodeCol, - prop_handler::*, + arrow_loader::{ + dataframe::{DFChunk, DFView, SecondaryIndexCol}, + df_loaders::{ + extract_secondary_index_col, process_shared_properties, resolve_nodes_with_cache, }, + layer_col::lift_layer_col, + node_col::NodeCol, + prop_handler::*, LOAD_POOL, }, + db::api::{storage::storage::PersistenceStrategy, view::StaticGraphViewOps}, + errors::{into_graph_err, GraphError, LoadError}, prelude::*, }; use arrow::{array::AsArray, datatypes::UInt64Type}; use bytemuck::checked::cast_slice_mut; use db4_graph::WriteLockedGraph; use itertools::izip; -use kdam::BarExt; use raphtory_api::{ atomic_extra::{atomic_usize_from_mut_slice, atomic_vid_from_mut_slice}, core::{ @@ -227,15 +226,6 @@ pub fn load_edges_from_df = vec![]; // exists or needs to be created let mut layer_eids_exist: Vec = vec![]; // exists or needs to be created - // I want to find out which of the segments are touched by every chunk - let mut edge_segments_touched = (0..graph.core_graph().num_edge_segments()) - .map(|_| AtomicBool::new(false)) - .collect::>(); - - let mut node_segments_touched = (0..graph.core_graph().num_node_segments()) 
- .map(|_| AtomicBool::new(false)) - .collect::>(); - for chunk in df_view.chunks.into_iter() { let df = chunk?; let prop_cols = @@ -292,25 +282,16 @@ pub fn load_edges_from_df>( +fn update_edge_properties>( shared_metadata: &[(usize, Prop)], prop_cols: &PropCols, metadata_cols: &PropCols, shard: &mut LockedEdgePage<'_, ES>, - zip: impl Iterator, + zip: impl Iterator, delete: bool, ) { let mut t_props = vec![]; let mut c_props = vec![]; let mut writer = shard.writer(); - for (row, (src, dst, time, secondary_index, eid, layer, exists)) in zip.enumerate() { - if let Some(eid_pos) = writer.resolve_pos(*eid) { + for (row, src, dst, time, secondary_index, eid, layer, exists) in zip { + if let Some(eid_pos) = writer.resolve_pos(eid) { let t = EventTime(time, secondary_index); t_props.clear(); @@ -559,27 +551,28 @@ fn update_edge_properties<'a, ES: EdgeSegmentOps>( writer.bulk_add_edge( t, eid_pos, - *src, - *dst, + src, + dst, exists, - LayerId(*layer), + LayerId(layer), c_props.drain(..), t_props.drain(..), ); } else { - writer.bulk_delete_edge(t, eid_pos, *src, *dst, exists, LayerId(*layer)); + writer.bulk_delete_edge(t, eid_pos, src, dst, exists, LayerId(layer)); } } } } -fn update_inbound_edges<'a, NS: NodeSegmentOps>( +fn update_inbound_edges>( shard: &mut LockedNodePage<'_, NS>, - zip: impl Iterator, + zip: impl Iterator, delete: bool, ) { let mut writer = shard.writer(); for ( + _row, src, dst, eid, @@ -590,28 +583,28 @@ fn update_inbound_edges<'a, NS: NodeSegmentOps>( edge_exists_in_static_graph, ) in zip { - if let Some(dst_pos) = writer.resolve_pos(*dst) { + if let Some(dst_pos) = writer.resolve_pos(dst) { let t = EventTime(time, secondary_index); if !edge_exists_in_static_graph { - writer.add_static_inbound_edge(dst_pos, *src, *eid); + writer.add_static_inbound_edge(dst_pos, src, eid); } let elid = if delete { - eid.with_layer_deletion(LayerId(*layer)) + eid.with_layer_deletion(LayerId(layer)) } else { - eid.with_layer(LayerId(*layer)) + 
eid.with_layer(LayerId(layer)) }; if src != dst { if edge_exists_in_layer { writer.update_timestamp(t, dst_pos, elid); } else { - writer.add_inbound_edge(Some(t), dst_pos, *src, elid); + writer.add_inbound_edge(Some(t), dst_pos, src, elid); } } else { // self-loop edge, only add once if !edge_exists_in_layer { - writer.add_inbound_edge::(None, dst_pos, *src, elid); + writer.add_inbound_edge::(None, dst_pos, src, elid); } } } @@ -620,7 +613,6 @@ fn update_inbound_edges<'a, NS: NodeSegmentOps>( #[allow(clippy::type_complexity, clippy::too_many_arguments)] fn add_and_resolve_outbound_edges< - 'a, EXT: PersistenceStrategy, NS: NodeSegmentOps, ES: EdgeSegmentOps, @@ -628,29 +620,26 @@ fn add_and_resolve_outbound_edges< eids_exist: &[AtomicBool], layer_eids_exist: &[AtomicBool], eid_col_shared: &&mut [AtomicUsize], - edge_touched_segments: &[AtomicBool], - max_edge_page_len: u32, + zip: impl Iterator, next_edge_id: impl Fn(usize) -> EID, edges: &WriteLockedEdgePages<'_, ES>, locked_page: &mut LockedNodePage<'_, NS>, - zip: impl Iterator, delete: bool, ) { let mut writer = locked_page.writer(); - let mut last_edge_segment = usize::MAX; - for (row, (src, dst, time, secondary_index, layer)) in zip.enumerate() { - if let Some(src_pos) = writer.resolve_pos(*src) { + for (row, src, dst, time, secondary_index, layer) in zip { + if let Some(src_pos) = writer.resolve_pos(src) { let t = EventTime(time, secondary_index); // find the original EID in the static graph if it exists // otherwise create a new one - let edge_id = if let Some(edge_id) = writer.get_out_edge(src_pos, *dst, LayerId(0)) { + let edge_id = if let Some(edge_id) = writer.get_out_edge(src_pos, dst, LayerId(0)) { eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); eids_exist[row].store(true, Ordering::Relaxed); MaybeNew::Existing(edge_id) } else { let edge_id = next_edge_id(row); - writer.add_static_outbound_edge(src_pos, *dst, edge_id); + writer.add_static_outbound_edge(src_pos, dst, edge_id); 
eid_col_shared[row].store(edge_id.0, Ordering::Relaxed); eids_exist[row].store(false, Ordering::Relaxed); MaybeNew::New(edge_id) @@ -658,24 +647,16 @@ fn add_and_resolve_outbound_edges< let edge_id = edge_id.map(|eid| { if delete { - eid.with_layer_deletion(LayerId(*layer)) + eid.with_layer_deletion(LayerId(layer)) } else { - eid.with_layer(LayerId(*layer)) + eid.with_layer(LayerId(layer)) } }); - let (edge_segment, _) = resolve_pos(edge_id.inner().edge, max_edge_page_len); - if edge_segment != last_edge_segment { - if let Some(touched) = edge_touched_segments.get(edge_segment) { - touched.store(true, Ordering::Relaxed); - } - } - last_edge_segment = edge_segment; - let exists = !edge_id.is_new() && (edges.exists(edge_id.inner()) || writer - .get_out_edge(src_pos, *dst, edge_id.inner().layer()) + .get_out_edge(src_pos, dst, edge_id.inner().layer()) .is_some()); layer_eids_exist[row].store(exists, Ordering::Relaxed); @@ -683,12 +664,52 @@ fn add_and_resolve_outbound_edges< if exists { writer.update_timestamp(t, src_pos, edge_id.inner()); } else { - writer.add_outbound_edge(Some(t), src_pos, *dst, edge_id.inner()); + writer.add_outbound_edge(Some(t), src_pos, dst, edge_id.inner()); } } } } +fn group_rows_by_vid_segment( + vids: &[VID], + max_segment_len: u32, + num_segments: usize, +) -> Vec> { + let mut rows_by_segment = vec![Vec::new(); num_segments]; + for (row, vid) in vids.iter().enumerate() { + let (segment_id, _) = resolve_pos(vid.index(), max_segment_len); + let rows = rows_by_segment + .get_mut(segment_id) + .expect("segment not found while grouping by vid"); + rows.push(row); + } + rows_by_segment +} + +fn group_rows_by_eid_segment( + eids: &[EID], + max_segment_len: u32, + num_segments: usize, +) -> Vec> { + let mut rows_by_segment = vec![Vec::new(); num_segments]; + for (row, eid) in eids.iter().enumerate() { + let (segment_id, _) = resolve_pos(*eid, max_segment_len); + let rows = rows_by_segment + .get_mut(segment_id) + .expect("segment not found while 
grouping by eid"); + rows.push(row); + } + rows_by_segment +} + +#[inline(always)] +fn secondary_index_at(col: &SecondaryIndexCol, row: usize) -> usize { + match col { + SecondaryIndexCol::DataFrame(arr) => arr.value(row) as usize, + SecondaryIndexCol::Range(range) => range.start + row, + } +} + pub fn store_node_ids>( gid_str_cache: &[(GidRef<'_>, VID)], locked_page: &mut LockedNodePage<'_, NS>, diff --git a/raphtory/src/io/arrow/df_loaders/mod.rs b/raphtory/src/arrow_loader/df_loaders/mod.rs similarity index 99% rename from raphtory/src/io/arrow/df_loaders/mod.rs rename to raphtory/src/arrow_loader/df_loaders/mod.rs index b0183b4869..c9ddc6e923 100644 --- a/raphtory/src/io/arrow/df_loaders/mod.rs +++ b/raphtory/src/arrow_loader/df_loaders/mod.rs @@ -1,14 +1,15 @@ use crate::{ - db::api::view::StaticGraphViewOps, - errors::{into_graph_err, GraphError}, - io::arrow::{ + arrow_loader::{ dataframe::{DFChunk, DFView, SecondaryIndexCol}, df_loaders::edges::ColumnNames, node_col::NodeCol, prop_handler::*, }, + db::api::view::StaticGraphViewOps, + errors::{into_graph_err, GraphError}, prelude::*, }; +#[cfg(feature = "progress")] use kdam::{Bar, BarBuilder, BarExt}; use raphtory_api::core::{ entities::properties::prop::PropType, diff --git a/raphtory/src/io/arrow/df_loaders/nodes.rs b/raphtory/src/arrow_loader/df_loaders/nodes.rs similarity index 92% rename from raphtory/src/io/arrow/df_loaders/nodes.rs rename to raphtory/src/arrow_loader/df_loaders/nodes.rs index f3c23f9869..39282d2b56 100644 --- a/raphtory/src/io/arrow/df_loaders/nodes.rs +++ b/raphtory/src/arrow_loader/df_loaders/nodes.rs @@ -2,19 +2,6 @@ use crate::{ core::entities::nodes::node_ref::AsNodeRef, db::api::view::StaticGraphViewOps, errors::{into_graph_err, GraphError, LoadError}, - io::{ - arrow::{ - dataframe::{DFChunk, DFView}, - df_loaders::{ - extract_secondary_index_col, process_shared_properties, - resolve_nodes_and_type_with_cache, - }, - layer_col::{lift_layer_col, lift_node_type_col, LayerCol}, - 
node_col::NodeCol, - prop_handler::*, - }, - LOAD_POOL, - }, prelude::*, }; use arrow::{array::AsArray, datatypes::UInt64Type}; @@ -46,8 +33,17 @@ use storage::{ }; #[cfg(feature = "progress")] -use crate::io::arrow::df_loaders::build_progress_bar; - +use crate::arrow_loader::df_loaders::build_progress_bar; +use crate::arrow_loader::{ + dataframe::{DFChunk, DFView}, + df_loaders::{ + extract_secondary_index_col, process_shared_properties, resolve_nodes_and_type_with_cache, + }, + layer_col::{lift_layer_col, lift_node_type_col, LayerCol}, + node_col::NodeCol, + prop_handler::*, + LOAD_POOL, +}; #[cfg(feature = "progress")] use kdam::BarExt; @@ -245,6 +241,7 @@ pub fn load_nodes_from_df< Ok(()) } +/// Must be called from a single-threaded context if is_materializing == true && node_id_col.is_none() && node_type_id_col.is_none() #[allow(clippy::too_many_arguments)] pub fn load_node_props_from_df< 'a, @@ -259,6 +256,7 @@ pub fn load_node_props_from_df< metadata: &[&str], shared_metadata: Option<&HashMap>, graph: &G, + is_materializing: bool, ) -> Result<(), GraphError> { if df_view.is_empty() { return Ok(()); @@ -320,6 +318,7 @@ pub fn load_node_props_from_df< &df, &node_col, node_type_col, + is_materializing, )?; // We assume this is fast enough @@ -442,6 +441,7 @@ fn get_or_resolve_node_vids_no_events< df: &'b DFChunk, src_col: &'a NodeCol, node_type_col: LayerCol<'a>, + is_materializing: bool, ) -> Result<(&'c [VID], &'c [usize]), GraphError> { assert!(!(node_type_ids_col.is_none() ^ node_id_col.is_none())); // both some or both none if let Some((node_type_index, node_id_col)) = node_type_ids_col.zip(node_id_col) { @@ -462,6 +462,7 @@ fn get_or_resolve_node_vids_no_events< df, src_col, node_type_col, + is_materializing, ) } } @@ -476,6 +477,7 @@ fn resolve_node_and_meta_for_node_col< df: &DFChunk, src_col: &NodeCol, node_type_col: LayerCol<'a>, + is_materializing: bool, ) -> Result<(&'a [VID], &'a [usize]), GraphError> { node_col_resolved.resize_with(df.len(), 
Default::default); node_type_resolved.resize_with(df.len(), Default::default); @@ -505,9 +507,20 @@ fn resolve_node_and_meta_for_node_col< *node_type_id = id; } - let res_vid = graph - .internalise_node(gid.as_node_ref()) - .unwrap_or_default(); + // Create the node if it doesn't exist yet so metadata-only callers + // (e.g. materialize loading node c_props before t_props) still + // allocate a fresh VID in the target graph. + let res_vid = if is_materializing { + // Safe because load_node_props_from_df is called sequentially from the + // materialize_impl consumer loop (one record batch at a time), and the resolve loop is serial + // both here and in load_node_props_from_df, so no other thread resolves the same id concurrently. + // Other future callers should make sure to utilize this pathway in single-threaded contexts only. + unsafe { graph.bulk_load_resolve_node(gid).map_err(into_graph_err)? } + } else { + graph + .internalise_node(gid.as_node_ref()) + .unwrap_or_default() + }; *vid = res_vid; last_node_type = node_type; } diff --git a/raphtory/src/io/arrow/layer_col.rs b/raphtory/src/arrow_loader/layer_col.rs similarity index 99% rename from raphtory/src/io/arrow/layer_col.rs rename to raphtory/src/arrow_loader/layer_col.rs index 9a6493325e..61a1a86530 100644 --- a/raphtory/src/io/arrow/layer_col.rs +++ b/raphtory/src/arrow_loader/layer_col.rs @@ -1,8 +1,8 @@ use std::borrow::Cow; use crate::{ + arrow_loader::dataframe::DFChunk, errors::{into_graph_err, GraphError, LoadError}, - io::arrow::dataframe::DFChunk, prelude::AdditionOps, }; use arrow::array::{Array, AsArray, LargeStringArray, StringArray, StringViewArray}; diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/arrow_loader/mod.rs similarity index 95% rename from raphtory/src/io/arrow/mod.rs rename to raphtory/src/arrow_loader/mod.rs index 30fe0a9fc2..11ee5bd71f 100644 --- a/raphtory/src/io/arrow/mod.rs +++ b/raphtory/src/arrow_loader/mod.rs @@ -1,3 +1,6 @@ +use rayon::{ThreadPool, 
ThreadPoolBuilder}; +use std::sync::LazyLock; + pub mod dataframe; pub mod df_loaders; mod layer_col; @@ -7,7 +10,7 @@ pub mod prop_handler; #[cfg(test)] mod test { use crate::{ - io::arrow::{ + arrow_loader::{ dataframe::{DFChunk, DFView}, df_loaders::{ edges::{load_edges_from_df_prefetch, ColumnNames}, @@ -208,3 +211,10 @@ mod test { ); } } + +pub(crate) static LOAD_POOL: LazyLock = LazyLock::new(|| { + ThreadPoolBuilder::new() + .thread_name(|idx| format!("PS Bulk Load Thread-{idx}")) + .build() + .unwrap() +}); diff --git a/raphtory/src/io/arrow/node_col.rs b/raphtory/src/arrow_loader/node_col.rs similarity index 98% rename from raphtory/src/io/arrow/node_col.rs rename to raphtory/src/arrow_loader/node_col.rs index 419647bc47..9a3fe3eae4 100644 --- a/raphtory/src/io/arrow/node_col.rs +++ b/raphtory/src/arrow_loader/node_col.rs @@ -1,6 +1,6 @@ use std::any::Any; -use crate::{errors::LoadError, io::arrow::dataframe::DFChunk, prelude::AdditionOps}; +use crate::{arrow_loader::dataframe::DFChunk, errors::LoadError, prelude::AdditionOps}; use arrow::{ array::{ Array, AsArray, Int32Array, Int64Array, LargeStringArray, StringArray, StringViewArray, diff --git a/raphtory/src/io/arrow/prop_handler.rs b/raphtory/src/arrow_loader/prop_handler.rs similarity index 96% rename from raphtory/src/io/arrow/prop_handler.rs rename to raphtory/src/arrow_loader/prop_handler.rs index 7216ce7ff8..d08499a843 100644 --- a/raphtory/src/io/arrow/prop_handler.rs +++ b/raphtory/src/arrow_loader/prop_handler.rs @@ -1,4 +1,4 @@ -use crate::{errors::GraphError, io::arrow::dataframe::DFChunk}; +use crate::{arrow_loader::dataframe::DFChunk, errors::GraphError}; use arrow::array::{Array, ArrayRef}; use raphtory_api::core::{ entities::properties::prop::{ diff --git a/raphtory/src/db/api/view/graph.rs b/raphtory/src/db/api/view/graph.rs index 73c935c7c4..0f66969cb7 100644 --- a/raphtory/src/db/api/view/graph.rs +++ b/raphtory/src/db/api/view/graph.rs @@ -1,11 +1,21 @@ #[cfg(feature = "io")] use 
crate::serialise::GraphPaths; use crate::{ + arrow_loader::{ + dataframe::{DFChunk, DFView}, + df_loaders::{ + edge_props::load_edges_from_df as load_edge_props_from_df, + edges::{load_edges_from_df, ColumnNames}, + load_edge_deletions_from_df, load_graph_props_from_df, + nodes::{load_node_props_from_df, load_nodes_from_df}, + }, + LOAD_POOL, + }, core::entities::{nodes::node_ref::AsNodeRef, LayerIds, VID}, db::{ api::{ - properties::{internal::InternalMetadataOps, Metadata, Properties}, - state::{ops::filter::NodeTypeFilterOp, Index}, + properties::{Metadata, Properties}, + state::ops::filter::NodeTypeFilterOp, view::{internal::*, *}, }, graph::{ @@ -20,21 +30,22 @@ use crate::{ }, }, errors::GraphError, + parquet_encoder::{ + encode_edge_cprop, encode_edge_deletions, encode_edge_tprop, encode_graph_cprop, + encode_graph_tprop, encode_nodes_cprop, encode_nodes_tprop, RecordBatchSink, DST_COL_ID, + DST_COL_VID, EDGE_COL_ID, ENCODE_POOL, LAYER_COL, LAYER_ID_COL, NODE_ID_COL, NODE_VID_COL, + SECONDARY_INDEX_COL, SRC_COL_ID, SRC_COL_VID, TIME_COL, TYPE_COL, TYPE_ID_COL, + }, prelude::*, }; use ahash::HashSet; +use arrow::array::RecordBatch; use db4_graph::TemporalGraph; use itertools::Itertools; -use raphtory_api::{ - atomic_extra::atomic_usize_from_mut_slice, - core::{ - entities::{ - properties::meta::{Meta, PropMapper, STATIC_GRAPH_LAYER_ID}, - LayerId, EID, - }, - storage::{arc_str::ArcStr, timeindex::EventTime}, - Direction, - }, +use raphtory_api::core::{ + entities::properties::meta::{Meta, PropMapper}, + storage::{arc_str::ArcStr, timeindex::EventTime}, + Direction, }; use raphtory_core::utils::iter::GenLockedIter; use raphtory_storage::{ @@ -42,17 +53,11 @@ use raphtory_storage::{ edges::edge_storage_ops::EdgeStorageOps, graph::GraphStorage, nodes::node_storage_ops::NodeStorageOps, }, - mutation::{ - addition_ops::{InternalAdditionOps, SessionAdditionOps}, - MutationError, - }, + mutation::addition_ops::SessionAdditionOps, }; use rayon::prelude::*; use 
rustc_hash::FxHashSet; -use std::{ - path::Path, - sync::{atomic::Ordering, Arc}, -}; +use std::{path::Path, sync::Arc}; use storage::{persist::strategy::PersistenceStrategy, Config, Extension}; #[cfg(feature = "search")] @@ -243,16 +248,107 @@ fn edges_inner<'graph, G: GraphView + 'graph>(g: &G, locked: bool) -> Edges<'gra } } -fn materialize_impl( +fn df_view_from_record_batch( + batch: RecordBatch, +) -> DFView> + Send> { + let (schema, columns, num_rows) = batch.into_parts(); + let field_names = schema + .fields() + .iter() + .map(|field| field.name().to_string()) + .collect(); + + DFView::new( + field_names, + std::iter::once(Ok(DFChunk::new(columns))), + Some(num_rows), + ) +} + +fn df_columns_except(names: &[String], exclude: &[&str]) -> Vec { + names + .iter() + .map(|name| name.as_str()) + .filter(|name| !exclude.contains(name)) + .map(str::to_string) + .collect() +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub(crate) enum RecordBatchKind { + EdgesT, + EdgesC, + EdgesD, + NodesT, + NodesC, + GraphT, + GraphC, +} + +#[derive(Debug, Clone)] +pub(crate) struct RecordBatchMessage { + batch: RecordBatch, + kind: RecordBatchKind, +} + +impl RecordBatchMessage { + pub(crate) fn kind(&self) -> RecordBatchKind { + self.kind + } + + pub(crate) fn into_batch(self) -> RecordBatch { + self.batch + } +} + +/// This RecordBatchSink allows for RecordBatches to be sent from multithreaded contexts (such as the parquet serializer) +/// to be consumed by the receiver. 
+#[derive(Debug, Clone)] +pub(crate) struct ChannelRecordBatchSink { + tx: crossbeam_channel::Sender, + kind: RecordBatchKind, +} + +impl ChannelRecordBatchSink { + pub(crate) fn new( + tx: crossbeam_channel::Sender, + kind: RecordBatchKind, + ) -> Self { + Self { tx, kind } + } +} + +impl RecordBatchSink for ChannelRecordBatchSink { + fn send_batch(&mut self, batch: RecordBatch) -> Result<(), GraphError> { + // sinks propagate their record batch kind to their messages + let record_batch_message = RecordBatchMessage { + batch, + kind: self.kind, + }; + + self.tx + .send(record_batch_message) + .map_err(|e| GraphError::IOErrorMsg(format!("RecordBatch receiver was dropped: {e}"))) + } + + fn finish(self) -> Result<(), GraphError> { + // implicitly drops self, so the transmitter (tx) is dropped as well + Ok(()) + } +} + +pub fn materialize_impl( graph: &impl GraphView, path: Option<&Path>, config: Config, ) -> Result { - let storage = graph.core_graph().lock(); let mut node_meta = Meta::new_for_nodes(); let mut edge_meta = Meta::new_for_edges(); let mut graph_props_meta = Meta::new_for_graph_props(); + // Preserve property mappers from the source graph so that + // windowed views expose the same prop mappings even for keys with no + // values inside the window. node_meta.set_metadata_mapper(graph.node_meta().metadata_mapper().deep_clone()); node_meta.set_temporal_prop_mapper(graph.node_meta().temporal_prop_mapper().deep_clone()); edge_meta.set_metadata_mapper(graph.edge_meta().metadata_mapper().deep_clone()); @@ -261,53 +357,30 @@ fn materialize_impl( graph_props_meta .set_temporal_prop_mapper(graph.graph_props_meta().temporal_prop_mapper().deep_clone()); + let base_layer_meta = graph.edge_meta().layer_meta(); let layer_meta = edge_meta.layer_meta(); - // NOTE: layers must be set in layer_meta before the TemporalGraph is initialized to // make sure empty layers are created. 
- let layer_map: Vec<_> = match graph.layer_ids() { - LayerIds::None => { - // no layers to map - vec![] - } + match graph.layer_ids() { + LayerIds::None => {} LayerIds::All => { - let layers = storage.edge_meta().layer_meta().keys(); - let mut layer_map = vec![0; storage.edge_meta().layer_meta().num_all_fields()]; - - for (id, name) in storage.edge_meta().layer_meta().ids().zip(layers.iter()) { - let new_id = layer_meta.get_or_create_id(name).inner(); - layer_map[id] = new_id; + for name in base_layer_meta.keys().iter() { + layer_meta.get_or_create_id(name); } - - layer_map } - LayerIds::One(l_id) => { - let mut layer_map = vec![0; storage.edge_meta().layer_meta().num_all_fields()]; - let layer_name = storage.edge_meta().get_layer_name_by_id(*l_id); - let new_id = layer_meta.get_or_create_id(&layer_name).inner(); - - layer_map[l_id.0] = new_id; - layer_map + LayerIds::One(id) => { + layer_meta.get_or_create_id(&base_layer_meta.get_name(id.0)); } LayerIds::Multiple(ids) => { - let mut layer_map = vec![0; storage.edge_meta().layer_meta().num_all_fields()]; - let layers = storage.edge_meta().layer_meta().all_keys(); - + let all_layers = base_layer_meta.all_keys(); for id in ids { - let layer_name = &layers[id.0]; - let new_id = layer_meta.get_or_create_id(layer_name).inner(); - layer_map[id.0] = new_id; + layer_meta.get_or_create_id(&all_layers[id.0]); } - - layer_map } - }; - - node_meta.set_layer_mapper(layer_meta.clone()); + } + node_meta.set_layer_mapper(layer_meta.deep_clone()); - // Create new WAL file for the new materialized graph. 
let ext = Extension::new(config, path)?; - let temporal_graph = TemporalGraph::new_with_meta( path.map(|p| p.into()), node_meta, @@ -318,288 +391,613 @@ fn materialize_impl( if let Some(earliest) = graph.earliest_time() { temporal_graph.update_time(earliest); - }; + } if let Some(latest) = graph.latest_time() { temporal_graph.update_time(latest); - }; + } - // Set event counter to be the same as old graph to avoid any possibility for duplicate event ids temporal_graph .storage() - .set_event_id(storage.read_event_id()); - - let temporal_graph = Arc::new(temporal_graph); - let graph_storage = GraphStorage::from(temporal_graph.clone()); - - { - // scope for the write lock - - // reverse index pos -> new_vid - let index = Index::for_graph(graph); - let mut node_map = vec![VID::default(); index.len()]; - let node_map_shared = atomic_usize_from_mut_slice(bytemuck::cast_slice_mut(&mut node_map)); + .set_event_id(graph.core_graph().lock().read_event_id()); + + let graph_storage = GraphStorage::from(Arc::new(temporal_graph)); + let materialized = graph.new_base_graph(graph_storage); + + let stream_capacity = 10; + let (tx, rx) = crossbeam_channel::bounded::(stream_capacity); + + let mut scope_result = Ok(()); + // Use std::thread::scope rather than rayon::scope so the producer runs on its own OS thread. + // With rayon::scope on a single-thread pool, the main thread blocking on rx.recv() would starve the spawned producer. + std::thread::scope(|scope| { + let producer_tx = tx.clone(); + let producer_handle = scope.spawn(move || { + let make_sink_factory = |kind| { + let tx = producer_tx.clone(); + move |_, _, _| Ok(ChannelRecordBatchSink::new(tx.clone(), kind)) + }; - index.par_iter().for_each(|(_, vid)| { - if let Some(pos) = index.index(&vid) { - let new_vid = temporal_graph.storage().nodes().reserve_vid(pos); - node_map_shared[pos].store(new_vid.index(), Ordering::Relaxed); - } + // EdgesD must run before EdgesC: edges that exist only via + // deletions (e.g. 
in a windowed persistent graph) aren't + // produced by EdgesT, so the deletion pass is what + // materializes them. The edge-metadata loader then expects + // every layer-edge it sees to already exist. + // NodesC must run before NodesT as well. + ENCODE_POOL.install(|| -> Result<(), GraphError> { + encode_nodes_cprop(graph, make_sink_factory(RecordBatchKind::NodesC))?; + encode_nodes_tprop(graph, make_sink_factory(RecordBatchKind::NodesT))?; + encode_edge_tprop(graph, make_sink_factory(RecordBatchKind::EdgesT))?; + encode_edge_deletions(graph, make_sink_factory(RecordBatchKind::EdgesD))?; + encode_edge_cprop(graph, make_sink_factory(RecordBatchKind::EdgesC))?; + encode_graph_tprop(graph, make_sink_factory(RecordBatchKind::GraphT))?; + encode_graph_cprop(graph, make_sink_factory(RecordBatchKind::GraphC))?; + Ok(()) + }) }); - let get_new_vid = |old_vid: VID, index: &Index, node_map: &[VID]| -> VID { - let pos = index - .index(&old_vid) - .expect("old_vid should exist in index"); - node_map[pos] - }; - let mut new_storage = graph_storage.write_lock()?; - - for layer_id in &layer_map { - new_storage.nodes.ensure_layer(LayerId(*layer_id)); - } - - new_storage.nodes.par_iter_mut().try_for_each(|shard| { - for (pos, vid) in index.iter().enumerate() { - let new_id = node_map[pos]; - if let Some(node_pos) = shard.resolve_pos(new_id) { - let node = NodeView::new_internal(graph, vid); - let gid = node.id(); - let mut writer = shard.writer(); - - if let Some(node_type) = node.node_type() { - let new_type_id = graph_storage - .node_meta() - .node_type_meta() - .get_or_create_id(&node_type) - .inner(); - writer.store_node_id_and_node_type( - node_pos, - STATIC_GRAPH_LAYER_ID, - gid.as_ref(), - new_type_id, - ); - } else { - writer.store_node_id(node_pos, STATIC_GRAPH_LAYER_ID, gid.clone()); - } - - graph_storage - .write_session()? 
- .set_node(gid.as_ref(), new_id)?; - - for (t, l, row) in node.rows() { - writer.add_props(t, node_pos, LayerId(l.0), row); // TODO: Fix me - } - - writer.update_c_props( - node_pos, - STATIC_GRAPH_LAYER_ID, - node.metadata_ids() - .filter_map(|id| node.get_metadata(id).map(|prop| (id, prop))), - ); - } - } + drop(tx); - Ok::<(), MutationError>(()) - })?; + let consumer_result = loop { + let message = match rx.recv() { + Ok(message) => message, + Err(_) => break Ok(()), // error here means the channel is empty and disconnected + }; - let mut new_eids = vec![]; - let mut max_eid = 0usize; - for (row, _) in graph.edges().iter().enumerate() { - let new_eid = new_storage.graph().storage().edges().reserve_new_eid(row); - new_eids.push(new_eid); - max_eid = new_eid.0.max(max_eid); - } - new_storage.resize_segments_to_eid(EID(max_eid)); + let kind = message.kind(); + let df_view = df_view_from_record_batch(message.into_batch()); - for layer_id in &layer_map { - new_storage.edges.ensure_layer(LayerId(*layer_id)); - } + let result = match kind { + RecordBatchKind::NodesC => { + let node_c_props = df_columns_except( + &df_view.names, + &[NODE_ID_COL, NODE_VID_COL, TYPE_COL, TYPE_ID_COL], + ); + let node_c_props_refs = + node_c_props.iter().map(String::as_str).collect::>(); - new_storage.edges.par_iter_mut().try_for_each(|shard| { - for (row, edge) in graph.edges().iter().enumerate() { - let src = get_new_vid(edge.edge.src(), &index, &node_map); - let dst = get_new_vid(edge.edge.dst(), &index, &node_map); - let eid = new_eids[row]; - if let Some(edge_pos) = shard.resolve_pos(eid) { - let mut writer = shard.writer(); - // make the edge for the first time - writer.add_static_edge(Some(edge_pos), src, dst, false); - - for edge in edge.explode_layers() { - let layer = LayerId(layer_map[edge.edge.layer().unwrap().0]); - for edge in edge.explode() { - let t = edge.edge.time().unwrap(); - writer.add_edge(t, edge_pos, src, dst, [], layer); - } - //TODO: move this in edge.row() - for 
(t, t_props) in edge - .properties() - .temporal() - .values() - .map(|tp| { - let prop_id = tp.id(); - tp.iter_indexed() - .map(|(t, prop)| (t, prop_id, prop)) - .collect::>() - }) - .kmerge_by(|(t, _, _), (t2, _, _)| t <= t2) - .chunk_by(|(t, _, _)| *t) - .into_iter() - { - let props = t_props - .map(|(_, prop_id, prop)| (prop_id, prop)) - .collect::>(); - writer.add_edge(t, edge_pos, src, dst, props, layer); - } - writer.update_c_props( - edge_pos, - src, - dst, - layer, - edge.clone().metadata_ids().filter_map(move |prop_id| { - edge.get_metadata(prop_id).map(|prop| (prop_id, prop)) - }), - ); - } - - let time_semantics = graph.edge_time_semantics(); - let edge_entry = graph.core_edge(edge.edge.pid()); - for (t, layer) in time_semantics.edge_deletion_history( - edge_entry.as_ref(), - graph, - graph.layer_ids(), - ) { - let layer = LayerId(layer_map[layer.0]); - writer.delete_edge(t, edge_pos, src, dst, layer); - } + LOAD_POOL.install(|| { + load_node_props_from_df( + df_view, + NODE_ID_COL, + None, + Some(TYPE_COL), + None, + None, + &node_c_props_refs, + None, + &materialized, + true, + ) + }) } - } - Ok::<(), MutationError>(()) - })?; - - new_storage.nodes.par_iter_mut().try_for_each(|shard| { - for (row, edge) in graph.edges().iter().enumerate() { - let eid = new_eids[row]; - let src_id = get_new_vid(edge.edge.src(), &index, &node_map); - let dst_id = get_new_vid(edge.edge.dst(), &index, &node_map); - let maybe_src_pos = shard.resolve_pos(src_id); - let maybe_dst_pos = shard.resolve_pos(dst_id); - - if let Some(node_pos) = maybe_src_pos { - let mut writer = shard.writer(); - writer.add_static_outbound_edge(node_pos, dst_id, eid); + RecordBatchKind::NodesT => { + let node_t_props = df_columns_except( + &df_view.names, + &[ + NODE_ID_COL, + NODE_VID_COL, + TYPE_COL, + TIME_COL, + SECONDARY_INDEX_COL, + ], + ); + let node_t_props_refs = + node_t_props.iter().map(String::as_str).collect::>(); + + load_nodes_from_df( + df_view, + TIME_COL, + 
Some(SECONDARY_INDEX_COL), + NODE_ID_COL, + &node_t_props_refs, + &[], + None, + None, + Some(TYPE_COL), + &materialized, + true, + None, + None, + ) } - - if let Some(node_pos) = maybe_dst_pos { - let mut writer = shard.writer(); - writer.add_static_inbound_edge(node_pos, src_id, eid); + RecordBatchKind::EdgesT => { + let edge_t_props = df_columns_except( + &df_view.names, + &[ + TIME_COL, + SECONDARY_INDEX_COL, + SRC_COL_VID, + SRC_COL_ID, + DST_COL_VID, + DST_COL_ID, + EDGE_COL_ID, + LAYER_COL, + LAYER_ID_COL, + ], + ); + let edge_t_props_refs = + edge_t_props.iter().map(String::as_str).collect::>(); + + LOAD_POOL.install(|| { + load_edges_from_df( + df_view, + ColumnNames::new( + TIME_COL, + Some(SECONDARY_INDEX_COL), + SRC_COL_ID, + DST_COL_ID, + Some(LAYER_COL), + ), + true, + &edge_t_props_refs, + &[], + None, + None, + &materialized, + false, + ) + }) } - - for e in edge.explode_layers() { - let layer = LayerId(layer_map[e.edge.layer().unwrap().0]); - if let Some(node_pos) = maybe_src_pos { - let mut writer = shard.writer(); - writer.add_outbound_edge::( + RecordBatchKind::EdgesC => { + let edge_c_props = df_columns_except( + &df_view.names, + &[ + SRC_COL_VID, + SRC_COL_ID, + DST_COL_VID, + DST_COL_ID, + EDGE_COL_ID, + LAYER_COL, + ], + ); + let edge_c_props_refs = + edge_c_props.iter().map(String::as_str).collect::>(); + + LOAD_POOL.install(|| { + load_edge_props_from_df( + df_view, + ColumnNames::new("", None, SRC_COL_ID, DST_COL_ID, Some(LAYER_COL)), + true, + &edge_c_props_refs, None, - node_pos, - dst_id, - eid.with_layer(layer), - ); - } - if let Some(node_pos) = maybe_dst_pos { - let mut writer = shard.writer(); - writer.add_inbound_edge::( None, - node_pos, - src_id, - eid.with_layer(layer), - ); - } + &materialized, + ) + }) } - - for e in edge.explode() { - if let Some(src_pos) = maybe_src_pos { - let mut writer = shard.writer(); - - let t = e.time().expect("exploded edge should have time"); - let l = 
LayerId(layer_map[e.edge.layer().unwrap().0]); - writer.update_timestamp(t, src_pos, eid.with_layer(l)); - } - if let Some(dst_pos) = maybe_dst_pos { - if maybe_src_pos.is_none_or(|src_pos| src_pos != dst_pos) { - let mut writer = shard.writer(); - - let t = e.time().expect("exploded edge should have time"); - let l = LayerId(layer_map[e.edge.layer().unwrap().0]); - writer.update_timestamp(t, dst_pos, eid.with_layer(l)); - } - } + RecordBatchKind::EdgesD => LOAD_POOL.install(|| { + load_edge_deletions_from_df( + df_view, + ColumnNames::new( + TIME_COL, + Some(SECONDARY_INDEX_COL), + SRC_COL_ID, + DST_COL_ID, + Some(LAYER_COL), + ), + true, + None, + &materialized, + ) + }), + RecordBatchKind::GraphT => { + let graph_t_props = + df_columns_except(&df_view.names, &[TIME_COL, SECONDARY_INDEX_COL]); + let graph_t_props_refs = + graph_t_props.iter().map(String::as_str).collect::>(); + + LOAD_POOL.install(|| { + load_graph_props_from_df( + df_view, + TIME_COL, + Some(SECONDARY_INDEX_COL), + Some(&graph_t_props_refs), + None, + &materialized, + ) + }) } - - let edge_time_semantics = graph.edge_time_semantics(); - let edge_entry = graph.core_edge(edge.edge.pid()); - for (t, layer) in edge_time_semantics.edge_deletion_history( - edge_entry.as_ref(), - graph, - graph.layer_ids(), - ) { - let layer = LayerId(layer_map[layer.0]); - if let Some(src_pos) = maybe_src_pos { - let mut writer = shard.writer(); - writer.update_timestamp(t, src_pos, eid.with_layer_deletion(layer)); - } - if let Some(dst_pos) = maybe_dst_pos { - if maybe_src_pos.is_none_or(|src_pos| src_pos != dst_pos) { - let mut writer = shard.writer(); - writer.update_timestamp(t, dst_pos, eid.with_layer_deletion(layer)); - } - } + RecordBatchKind::GraphC => { + let graph_c_props = df_columns_except(&df_view.names, &[TIME_COL]); + let graph_c_props_refs = + graph_c_props.iter().map(String::as_str).collect::>(); + + LOAD_POOL.install(|| { + load_graph_props_from_df( + df_view, + TIME_COL, + None, + None, + 
Some(&graph_c_props_refs), + &materialized, + ) + }) } - } - - Ok::<(), MutationError>(()) - })?; + }; - // Copy over graph properties - { - let graph_writer = new_storage.graph_props.writer(); - // Copy temporal properties - for (prop_name, temporal_prop) in graph.properties().temporal().iter() { - let prop_id = graph_storage - .graph_props_meta() - .temporal_prop_mapper() - .get_or_create_id(&prop_name) - .inner(); - - for (t, prop_value) in temporal_prop.iter_indexed() { - graph_writer.add_properties(t, [(prop_id, prop_value)]); - } + if let Err(err) = result { + break Err(err); } + }; - // Copy metadata (constant properties) - let metadata_props: Vec<_> = graph - .metadata() - .iter_filtered() - .map(|(prop_name, prop_value)| { - let prop_id = graph_storage - .graph_props_meta() - .metadata_mapper() - .get_or_create_id(&prop_name) - .inner(); - (prop_id, prop_value) - }) - .collect(); + drop(rx); - if !metadata_props.is_empty() { - graph_writer.update_metadata(metadata_props); - } - } - } + let producer_result = producer_handle.join().unwrap_or_else(|_| { + Err(GraphError::IOErrorMsg( + "record batch producer scope exited without reporting a result".to_string(), + )) + }); - Ok(graph.new_base_graph(graph_storage)) + scope_result = consumer_result.and(producer_result); + }); // std::thread::scope + scope_result?; + + Ok(materialized) } +// fn materialize_impl_old( +// graph: &impl GraphView, +// path: Option<&Path>, +// config: Config, +// ) -> Result { +// let storage = graph.core_graph().lock(); +// let mut node_meta = Meta::new_for_nodes(); +// let mut edge_meta = Meta::new_for_edges(); +// let mut graph_props_meta = Meta::new_for_graph_props(); +// +// node_meta.set_metadata_mapper(graph.node_meta().metadata_mapper().deep_clone()); +// node_meta.set_temporal_prop_mapper(graph.node_meta().temporal_prop_mapper().deep_clone()); +// edge_meta.set_metadata_mapper(graph.edge_meta().metadata_mapper().deep_clone()); +// 
edge_meta.set_temporal_prop_mapper(graph.edge_meta().temporal_prop_mapper().deep_clone()); +// graph_props_meta.set_metadata_mapper(graph.graph_props_meta().metadata_mapper().deep_clone()); +// graph_props_meta +// .set_temporal_prop_mapper(graph.graph_props_meta().temporal_prop_mapper().deep_clone()); +// +// let layer_meta = edge_meta.layer_meta(); +// +// // NOTE: layers must be set in layer_meta before the TemporalGraph is initialized to +// // make sure empty layers are created. +// let layer_map: Vec<_> = match graph.layer_ids() { +// LayerIds::None => { +// // no layers to map +// vec![] +// } +// LayerIds::All => { +// let layers = storage.edge_meta().layer_meta().keys(); +// let mut layer_map = vec![0; storage.edge_meta().layer_meta().num_all_fields()]; +// +// for (id, name) in storage.edge_meta().layer_meta().ids().zip(layers.iter()) { +// let new_id = layer_meta.get_or_create_id(name).inner(); +// layer_map[id] = new_id; +// } +// +// layer_map +// } +// LayerIds::One(l_id) => { +// let mut layer_map = vec![0; storage.edge_meta().layer_meta().num_all_fields()]; +// let layer_name = storage.edge_meta().get_layer_name_by_id(*l_id); +// let new_id = layer_meta.get_or_create_id(&layer_name).inner(); +// +// layer_map[l_id.0] = new_id; +// layer_map +// } +// LayerIds::Multiple(ids) => { +// let mut layer_map = vec![0; storage.edge_meta().layer_meta().num_all_fields()]; +// let layers = storage.edge_meta().layer_meta().all_keys(); +// +// for id in ids { +// let layer_name = &layers[id.0]; +// let new_id = layer_meta.get_or_create_id(layer_name).inner(); +// layer_map[id.0] = new_id; +// } +// +// layer_map +// } +// }; +// +// node_meta.set_layer_mapper(layer_meta.clone()); +// +// // Create new WAL file for the new materialized graph. 
+// let ext = Extension::new(config, path)?; +// +// let temporal_graph = TemporalGraph::new_with_meta( +// path.map(|p| p.into()), +// node_meta, +// edge_meta, +// graph_props_meta, +// ext, +// )?; +// +// if let Some(earliest) = graph.earliest_time() { +// temporal_graph.update_time(earliest); +// }; +// +// if let Some(latest) = graph.latest_time() { +// temporal_graph.update_time(latest); +// }; +// +// // Set event counter to be the same as old graph to avoid any possibility for duplicate event ids +// temporal_graph +// .storage() +// .set_event_id(storage.read_event_id()); +// +// let temporal_graph = Arc::new(temporal_graph); +// let graph_storage = GraphStorage::from(temporal_graph.clone()); +// +// { +// // scope for the write lock +// +// // reverse index pos -> new_vid +// let index = Index::for_graph(graph); +// let mut node_map = vec![VID::default(); index.len()]; +// let node_map_shared = atomic_usize_from_mut_slice(bytemuck::cast_slice_mut(&mut node_map)); +// +// index.par_iter().for_each(|(_, vid)| { +// if let Some(pos) = index.index(&vid) { +// let new_vid = temporal_graph.storage().nodes().reserve_vid(pos); +// node_map_shared[pos].store(new_vid.index(), Ordering::Relaxed); +// } +// }); +// +// let get_new_vid = |old_vid: VID, index: &Index, node_map: &[VID]| -> VID { +// let pos = index +// .index(&old_vid) +// .expect("old_vid should exist in index"); +// node_map[pos] +// }; +// let mut new_storage = graph_storage.write_lock()?; +// +// for layer_id in &layer_map { +// new_storage.nodes.ensure_layer(LayerId(*layer_id)); +// } +// +// new_storage.nodes.par_iter_mut().try_for_each(|shard| { +// for (pos, vid) in index.iter().enumerate() { +// let new_id = node_map[pos]; +// if let Some(node_pos) = shard.resolve_pos(new_id) { +// let node = NodeView::new_internal(graph, vid); +// let gid = node.id(); +// let mut writer = shard.writer(); +// +// if let Some(node_type) = node.node_type() { +// let new_type_id = graph_storage +// .node_meta() 
+// .node_type_meta() +// .get_or_create_id(&node_type) +// .inner(); +// writer.store_node_id_and_node_type( +// node_pos, +// STATIC_GRAPH_LAYER_ID, +// gid.as_ref(), +// new_type_id, +// ); +// } else { +// writer.store_node_id(node_pos, STATIC_GRAPH_LAYER_ID, gid.clone()); +// } +// +// graph_storage +// .write_session()? +// .set_node(gid.as_ref(), new_id)?; +// +// for (t, l, row) in node.rows() { +// writer.add_props(t, node_pos, LayerId(l.0), row); // TODO: Fix me +// } +// +// writer.update_c_props( +// node_pos, +// STATIC_GRAPH_LAYER_ID, +// node.metadata_ids() +// .filter_map(|id| node.get_metadata(id).map(|prop| (id, prop))), +// ); +// } +// } +// +// Ok::<(), MutationError>(()) +// })?; +// println!("Nodes 1 done at {}", Local::now()); +// +// let mut new_eids = vec![]; +// let mut max_eid = 0usize; +// for (row, _) in graph.edges().iter().enumerate() { +// let new_eid = new_storage.graph().storage().edges().reserve_new_eid(row); +// new_eids.push(new_eid); +// max_eid = new_eid.0.max(max_eid); +// } +// new_storage.resize_segments_to_eid(EID(max_eid)); +// +// for layer_id in &layer_map { +// new_storage.edges.ensure_layer(LayerId(*layer_id)); +// } +// +// new_storage.edges.par_iter_mut().try_for_each(|shard| { +// for (row, edge) in graph.edges().iter().enumerate() { +// let src = get_new_vid(edge.edge.src(), &index, &node_map); +// let dst = get_new_vid(edge.edge.dst(), &index, &node_map); +// let eid = new_eids[row]; +// if let Some(edge_pos) = shard.resolve_pos(eid) { +// let mut writer = shard.writer(); +// // make the edge for the first time +// writer.add_static_edge(Some(edge_pos), src, dst, false); +// +// for edge in edge.explode_layers() { +// let layer = LayerId(layer_map[edge.edge.layer().unwrap().0]); +// for edge in edge.explode() { +// let t = edge.edge.time().unwrap(); +// writer.add_edge(t, edge_pos, src, dst, [], layer); +// } +// //TODO: move this in edge.row() +// for (t, t_props) in edge +// .properties() +// .temporal() +// 
.values() +// .map(|tp| { +// let prop_id = tp.id(); +// tp.iter_indexed() +// .map(|(t, prop)| (t, prop_id, prop)) +// .collect::>() +// }) +// .kmerge_by(|(t, _, _), (t2, _, _)| t <= t2) +// .chunk_by(|(t, _, _)| *t) +// .into_iter() +// { +// let props = t_props +// .map(|(_, prop_id, prop)| (prop_id, prop)) +// .collect::>(); +// writer.add_edge(t, edge_pos, src, dst, props, layer); +// } +// writer.update_c_props( +// edge_pos, +// src, +// dst, +// layer, +// edge.clone().metadata_ids().filter_map(move |prop_id| { +// edge.get_metadata(prop_id).map(|prop| (prop_id, prop)) +// }), +// ); +// } +// +// let time_semantics = graph.edge_time_semantics(); +// let edge_entry = graph.core_edge(edge.edge.pid()); +// for (t, layer) in time_semantics.edge_deletion_history( +// edge_entry.as_ref(), +// graph, +// graph.layer_ids(), +// ) { +// let layer = LayerId(layer_map[layer.0]); +// writer.delete_edge(t, edge_pos, src, dst, layer); +// } +// } +// } +// Ok::<(), MutationError>(()) +// })?; +// println!("Edges 1 done at {}", Local::now()); +// +// new_storage.nodes.par_iter_mut().try_for_each(|shard| { +// for (row, edge) in graph.edges().iter().enumerate() { +// let eid = new_eids[row]; +// let src_id = get_new_vid(edge.edge.src(), &index, &node_map); +// let dst_id = get_new_vid(edge.edge.dst(), &index, &node_map); +// let maybe_src_pos = shard.resolve_pos(src_id); +// let maybe_dst_pos = shard.resolve_pos(dst_id); +// +// if let Some(node_pos) = maybe_src_pos { +// let mut writer = shard.writer(); +// writer.add_static_outbound_edge(node_pos, dst_id, eid); +// } +// +// if let Some(node_pos) = maybe_dst_pos { +// let mut writer = shard.writer(); +// writer.add_static_inbound_edge(node_pos, src_id, eid); +// } +// +// for e in edge.explode_layers() { +// let layer = LayerId(layer_map[e.edge.layer().unwrap().0]); +// if let Some(node_pos) = maybe_src_pos { +// let mut writer = shard.writer(); +// writer.add_outbound_edge::( +// None, +// node_pos, +// dst_id, +// 
eid.with_layer(layer), +// ); +// } +// if let Some(node_pos) = maybe_dst_pos { +// let mut writer = shard.writer(); +// writer.add_inbound_edge::( +// None, +// node_pos, +// src_id, +// eid.with_layer(layer), +// ); +// } +// } +// +// for e in edge.explode() { +// if let Some(src_pos) = maybe_src_pos { +// let mut writer = shard.writer(); +// +// let t = e.time().expect("exploded edge should have time"); +// let l = LayerId(layer_map[e.edge.layer().unwrap().0]); +// writer.update_timestamp(t, src_pos, eid.with_layer(l)); +// } +// if let Some(dst_pos) = maybe_dst_pos { +// if maybe_src_pos.is_none_or(|src_pos| src_pos != dst_pos) { +// let mut writer = shard.writer(); +// +// let t = e.time().expect("exploded edge should have time"); +// let l = LayerId(layer_map[e.edge.layer().unwrap().0]); +// writer.update_timestamp(t, dst_pos, eid.with_layer(l)); +// } +// } +// } +// +// let edge_time_semantics = graph.edge_time_semantics(); +// let edge_entry = graph.core_edge(edge.edge.pid()); +// for (t, layer) in edge_time_semantics.edge_deletion_history( +// edge_entry.as_ref(), +// graph, +// graph.layer_ids(), +// ) { +// let layer = LayerId(layer_map[layer.0]); +// if let Some(src_pos) = maybe_src_pos { +// let mut writer = shard.writer(); +// writer.update_timestamp(t, src_pos, eid.with_layer_deletion(layer)); +// } +// if let Some(dst_pos) = maybe_dst_pos { +// if maybe_src_pos.is_none_or(|src_pos| src_pos != dst_pos) { +// let mut writer = shard.writer(); +// writer.update_timestamp(t, dst_pos, eid.with_layer_deletion(layer)); +// } +// } +// } +// } +// +// Ok::<(), MutationError>(()) +// })?; +// println!("Nodes 2 done at {}", Local::now()); +// +// // Copy over graph properties +// { +// let graph_writer = new_storage.graph_props.writer(); +// // Copy temporal properties +// for (prop_name, temporal_prop) in graph.properties().temporal().iter() { +// let prop_id = graph_storage +// .graph_props_meta() +// .temporal_prop_mapper() +// 
.get_or_create_id(&prop_name) +// .inner(); +// +// for (t, prop_value) in temporal_prop.iter_indexed() { +// graph_writer.add_properties(t, [(prop_id, prop_value)]); +// } +// } +// println!("Temporal graph props done at {}", Local::now()); +// +// // Copy metadata (constant properties) +// let metadata_props: Vec<_> = graph +// .metadata() +// .iter_filtered() +// .map(|(prop_name, prop_value)| { +// let prop_id = graph_storage +// .graph_props_meta() +// .metadata_mapper() +// .get_or_create_id(&prop_name) +// .inner(); +// (prop_id, prop_value) +// }) +// .collect(); +// +// if !metadata_props.is_empty() { +// graph_writer.update_metadata(metadata_props); +// } +// println!("Graph metadata done at {}", Local::now()); +// } +// } +// +// Ok(graph.new_base_graph(graph_storage)) +// } + impl<'graph, G: GraphView + 'graph> GraphViewOps<'graph> for G { fn edges(&self) -> Edges<'graph, Self> { edges_inner(self, true) diff --git a/raphtory/src/db/graph/graph.rs b/raphtory/src/db/graph/graph.rs index 9ae43bf657..d5ce68c806 100644 --- a/raphtory/src/db/graph/graph.rs +++ b/raphtory/src/db/graph/graph.rs @@ -290,10 +290,10 @@ fn normalise_temporal_map( for (k, v) in map { let mut v2: Vec<(i64, Prop)> = v.iter().map(|(t, p)| (t.t(), p.clone())).collect(); - // stable deterministic ordering for same timestamp too + // stable deterministic ordering for events at the same timestamp v2.sort_by(|(t1, p1), (t2, p2)| { t1.cmp(t2) - .then_with(|| format!("{p1:?}").cmp(&format!("{p2:?}"))) + .then_with(|| canonical_prop_repr(p1).cmp(&canonical_prop_repr(p2))) }); out.insert(k.clone(), v2); @@ -302,6 +302,51 @@ fn normalise_temporal_map( out } +/// Render a `Prop` as a string that is invariant to `PropArray` variant +/// (`Vec` vs `Array`) and to `FxHashMap` iteration order (which is undefined). +/// Used as a stable sort key in test assertions. 
+fn canonical_prop_repr(p: &Prop) -> String { + let mut s = String::new(); + write_canonical_prop(&mut s, p); + s +} + +fn write_canonical_prop(out: &mut String, p: &Prop) { + use std::fmt::Write; + match p { + Prop::List(arr) => { + out.push_str("List(["); + let mut first = true; + for item in arr.iter() { + if !first { + out.push_str(", "); + } + first = false; + write_canonical_prop(out, &item); + } + out.push_str("])"); + } + Prop::Map(m) => { + let mut keys: Vec<&ArcStr> = m.keys().collect(); + keys.sort(); + out.push_str("Map({"); + let mut first = true; + for k in keys { + if !first { + out.push_str(", "); + } + first = false; + let _ = write!(out, "{k:?}: "); + write_canonical_prop(out, &m[k]); + } + out.push_str("})"); + } + other => { + let _ = write!(out, "{other:?}"); + } + } +} + #[track_caller] pub fn assert_node_equal<'graph, G1: GraphViewOps<'graph>, G2: GraphViewOps<'graph>>( n1: NodeView<'graph, G1>, @@ -551,8 +596,8 @@ pub fn assert_nodes_equal_layer< let mut nodes1: Vec<_> = nodes1.collect(); let mut nodes2: Vec<_> = nodes2.collect(); - nodes1.sort(); - nodes2.sort(); + nodes1.par_sort_unstable(); + nodes2.par_sort_unstable(); assert_eq!( nodes1.len(), @@ -560,9 +605,12 @@ pub fn assert_nodes_equal_layer< "mismatched number of nodes{layer_tag}", ); - for (n1, n2) in nodes1.into_iter().zip(nodes2) { - assert_node_equal_layer(n1, n2, layer_tag, persistent, only_timestamps); - } + nodes1 + .into_par_iter() + .zip_eq(nodes2.into_par_iter()) + .for_each(|(n1, n2)| { + assert_node_equal_layer(n1, n2, layer_tag, persistent, only_timestamps) + }); } #[track_caller] @@ -598,150 +646,153 @@ pub fn assert_edges_equal_layer< edges2.len(), "mismatched number of edges{layer_tag}", ); - edges1.sort_by(|e1, e2| e1.id().cmp(&e2.id())); - edges2.sort_by(|e1, e2| e1.id().cmp(&e2.id())); - - for (e1, e2) in edges1.into_iter().zip(edges2) { - assert_eq!(e1.id(), e2.id(), "mismatched edge ids{layer_tag}"); - if persistent || only_timestamps { - assert_eq!( - 
e1.earliest_time().map(|t| t.t()), - e2.earliest_time().map(|t| t.t()), - "mismatched earliest time for edge {:?}{layer_tag}", - e1.id(), - ); - assert_eq!( - e1.latest_time().map(|t| t.t()), - e2.latest_time().map(|t| t.t()), - "mismatched latest time for edge {:?}{layer_tag}", - e1.id(), - ); - } else { - assert_eq!( - e1.earliest_time(), - e2.earliest_time(), - "mismatched earliest time for edge {:?}{layer_tag}", - e1.id(), - ); - assert_eq!( - e1.latest_time(), - e2.latest_time(), - "mismatched latest time for edge {:?}{layer_tag}", - e1.id(), - ); - } - assert_eq!( - e1.metadata().as_map(), - e2.metadata().as_map(), - "mismatched metadata for edge {:?}{layer_tag}", - e1.id(), - ); - if only_timestamps { + edges1.par_sort_unstable_by(|e1, e2| e1.id().cmp(&e2.id())); + edges2.par_sort_unstable_by(|e1, e2| e1.id().cmp(&e2.id())); + + edges1 + .into_par_iter() + .zip_eq(edges2.into_par_iter()) + .for_each(|(e1, e2)| { + assert_eq!(e1.id(), e2.id(), "mismatched edge ids{layer_tag}"); + if persistent || only_timestamps { + assert_eq!( + e1.earliest_time().map(|t| t.t()), + e2.earliest_time().map(|t| t.t()), + "mismatched earliest time for edge {:?}{layer_tag}", + e1.id(), + ); + assert_eq!( + e1.latest_time().map(|t| t.t()), + e2.latest_time().map(|t| t.t()), + "mismatched latest time for edge {:?}{layer_tag}", + e1.id(), + ); + } else { + assert_eq!( + e1.earliest_time(), + e2.earliest_time(), + "mismatched earliest time for edge {:?}{layer_tag}", + e1.id(), + ); + assert_eq!( + e1.latest_time(), + e2.latest_time(), + "mismatched latest time for edge {:?}{layer_tag}", + e1.id(), + ); + } assert_eq!( - e1.properties() - .temporal() - .iter() - .map(|(key, value)| (key, value.iter().map(|(t, p)| (t.t(), p)).collect())) - .collect::>>(), - e2.properties() - .temporal() - .iter() - .map(|(key, value)| (key, value.iter().map(|(t, p)| (t.t(), p)).collect())) - .collect::>>(), - "mismatched temporal properties for edge {:?}{layer_tag}", + e1.metadata().as_map(), + 
e2.metadata().as_map(), + "mismatched metadata for edge {:?}{layer_tag}", e1.id(), ); + if only_timestamps { + assert_eq!( + e1.properties() + .temporal() + .iter() + .map(|(key, value)| (key, value.iter().map(|(t, p)| (t.t(), p)).collect())) + .collect::>>(), + e2.properties() + .temporal() + .iter() + .map(|(key, value)| (key, value.iter().map(|(t, p)| (t.t(), p)).collect())) + .collect::>>(), + "mismatched temporal properties for edge {:?}{layer_tag}", + e1.id(), + ); - let mut e1_updates: Vec<_> = e1 - .explode() - .iter() - .map(|e| (e.layer_name().unwrap(), e.time().unwrap().t())) - .collect(); - e1_updates.sort(); + let mut e1_updates: Vec<_> = e1 + .explode() + .iter() + .map(|e| (e.layer_name().unwrap(), e.time().unwrap().t())) + .collect(); + e1_updates.sort(); - let mut e2_updates: Vec<_> = e2 - .explode() - .iter() - .map(|e| (e.layer_name().unwrap(), e.time().unwrap().t())) - .collect(); - e2_updates.sort(); - assert_eq!( - e1_updates, - e2_updates, - "mismatched updates for edge {:?}{layer_tag}", - e1.id(), - ); - } else { - let left = normalise_temporal_map(&e1.properties().temporal().as_map()); - let right = normalise_temporal_map(&e2.properties().temporal().as_map()); + let mut e2_updates: Vec<_> = e2 + .explode() + .iter() + .map(|e| (e.layer_name().unwrap(), e.time().unwrap().t())) + .collect(); + e2_updates.sort(); + assert_eq!( + e1_updates, + e2_updates, + "mismatched updates for edge {:?}{layer_tag}", + e1.id(), + ); + } else { + let left = normalise_temporal_map(&e1.properties().temporal().as_map()); + let right = normalise_temporal_map(&e2.properties().temporal().as_map()); - assert_eq!( - left, - right, - "mismatched temporal properties for edge {:?}{layer_tag}", - e1.id(), - ); + assert_eq!( + left, + right, + "mismatched temporal properties for edge {:?}{layer_tag}", + e1.id(), + ); - let mut e1_updates: Vec<_> = e1 - .explode() - .iter() - .map(|e| (e.layer_name().unwrap(), e.time().unwrap())) - .collect(); - e1_updates.sort(); + let 
mut e1_updates: Vec<_> = e1 + .explode() + .iter() + .map(|e| (e.layer_name().unwrap(), e.time().unwrap())) + .collect(); + e1_updates.sort(); - let mut e2_updates: Vec<_> = e2 - .explode() - .iter() - .map(|e| (e.layer_name().unwrap(), e.time().unwrap())) - .collect(); - e2_updates.sort(); + let mut e2_updates: Vec<_> = e2 + .explode() + .iter() + .map(|e| (e.layer_name().unwrap(), e.time().unwrap())) + .collect(); + e2_updates.sort(); + assert_eq!( + e1_updates, + e2_updates, + "mismatched updates for edge {:?}{layer_tag}", + e1.id(), + ); + } assert_eq!( - e1_updates, - e2_updates, - "mismatched updates for edge {:?}{layer_tag}", - e1.id(), + e1.is_valid(), + e2.is_valid(), + "mismatched is_valid for edge {:?}{layer_tag}", + e1.id() ); - } - assert_eq!( - e1.is_valid(), - e2.is_valid(), - "mismatched is_valid for edge {:?}{layer_tag}", - e1.id() - ); - if persistent { - let earliest = e1.timeline_start(); - match earliest { - None => { - assert!( - e2.timeline_start().is_none(), - "expected empty timeline for edge {:?}{layer_tag}", - e1.id() - ) - } - Some(earliest) => { - assert_eq!( - e1.after(earliest.t()).is_active(), - e2.after(earliest.t()).is_active(), - "mismatched is_active for edge {:?}{layer_tag}", - e1.id() - ); + if persistent { + let earliest = e1.timeline_start(); + match earliest { + None => { + assert!( + e2.timeline_start().is_none(), + "expected empty timeline for edge {:?}{layer_tag}", + e1.id() + ) + } + Some(earliest) => { + assert_eq!( + e1.after(earliest.t()).is_active(), + e2.after(earliest.t()).is_active(), + "mismatched is_active for edge {:?}{layer_tag}", + e1.id() + ); + } } + } else { + assert_eq!( + e1.is_active(), + e2.is_active(), + "mismatched is_active for edge {:?}{layer_tag}", + e1.id() + ); } - } else { assert_eq!( - e1.is_active(), - e2.is_active(), - "mismatched is_active for edge {:?}{layer_tag}", + e1.is_deleted(), + e2.is_deleted(), + "mismatched is_deleted for edge {:?}{layer_tag}", e1.id() ); - } - assert_eq!( - 
e1.is_deleted(), - e2.is_deleted(), - "mismatched is_deleted for edge {:?}{layer_tag}", - e1.id() - ); - } + }); } #[track_caller] @@ -844,17 +895,17 @@ fn assert_graph_equal_inner<'graph, G1: GraphViewOps<'graph>, G2: GraphViewOps<' left_layers, right_layers ); - for layer in left_layers { + left_layers.par_iter().for_each(|layer| { assert_graph_equal_layer( &g1.layers(layer.deref()) .unwrap_or_else(|_| panic!("Left graph missing layer {layer})")), &g2.layers(layer.deref()) .unwrap_or_else(|_| panic!("Right graph missing layer {layer}")), - Some(&layer), + Some(layer), persistent, only_timestamps, ); - } + }); }) } diff --git a/raphtory/src/io/mod.rs b/raphtory/src/io/mod.rs index e2e8290fa5..cad0a76652 100644 --- a/raphtory/src/io/mod.rs +++ b/raphtory/src/io/mod.rs @@ -1,17 +1,5 @@ -use std::sync::LazyLock; - -use rayon::{ThreadPool, ThreadPoolBuilder}; - -pub mod arrow; pub mod csv_loader; pub mod json_loader; pub mod neo4j_loader; pub mod parquet_loaders; - -static LOAD_POOL: LazyLock = LazyLock::new(|| { - ThreadPoolBuilder::new() - .thread_name(|idx| format!("PS Bulk Load Thread-{idx}")) - .build() - .unwrap() -}); diff --git a/raphtory/src/io/parquet_loaders.rs b/raphtory/src/io/parquet_loaders.rs index 7c7b47a185..f43647bb2f 100644 --- a/raphtory/src/io/parquet_loaders.rs +++ b/raphtory/src/io/parquet_loaders.rs @@ -1,7 +1,5 @@ use crate::{ - db::api::view::StaticGraphViewOps, - errors::GraphError, - io::arrow::{ + arrow_loader::{ dataframe::*, df_loaders::{ edges::{load_edges_from_df_prefetch, ColumnNames}, @@ -9,6 +7,8 @@ use crate::{ *, }, }, + db::api::view::StaticGraphViewOps, + errors::GraphError, prelude::{AdditionOps, DeletionOps, PropertyAdditionOps}, }; use arrow::{ @@ -242,6 +242,7 @@ pub fn load_node_metadata_from_parquet< metadata_properties, shared_metadata, graph, + false, )?; } diff --git a/raphtory/src/lib.rs b/raphtory/src/lib.rs index 73338ee569..f5f21ceacc 100644 --- a/raphtory/src/lib.rs +++ b/raphtory/src/lib.rs @@ -106,8 +106,10 @@ 
pub mod vectors; pub mod io; pub mod api; +pub mod arrow_loader; pub mod core; pub mod errors; +pub mod parquet_encoder; #[cfg(feature = "io")] pub mod serialise; pub mod storage; diff --git a/raphtory/src/parquet_encoder/edges.rs b/raphtory/src/parquet_encoder/edges.rs new file mode 100644 index 0000000000..92956e3b93 --- /dev/null +++ b/raphtory/src/parquet_encoder/edges.rs @@ -0,0 +1,228 @@ +use super::*; +use crate::{ + db::{ + api::state::ops::{FilterOps, GraphView}, + graph::edge::EdgeView, + }, + errors::GraphError, + parquet_encoder::model::ParquetDelEdge, +}; +use arrow::datatypes::{DataType, Field}; +use model::ParquetCEdge; +use raphtory_api::core::entities::{LayerId, LayerIds}; +use raphtory_storage::graph::edges::{edge_storage_ops::EdgeStorageOps, edges::EdgesStorageRef}; + +fn get_edges_par_iter<'a, G: GraphView>( + g: &'a G, + edges_locked: &'a EdgesStorageRef, + layer_filter: &'a LayerIds, +) -> impl ParallelIterator> + 'a)> { + let filtered = g.filtered(); + + edges_locked + .segmented_par_iter() + .expect("Internal Error: segmented_par_iter cannot be called from unlocked GraphStorage") + .map(move |(chunk, eids)| { + ( + chunk, + eids.filter_map(move |eid| { + let edge = g.core_edge(eid); + if !edge.as_ref().has_layer(layer_filter) { + return None; + } + if !filtered || g.filter_edge(edge.as_ref()) { + let edge_ref = edge.out_ref(); + Some(EdgeView::new(g, edge_ref)) + } else { + None + } + }), + ) + }) +} + +fn active_layers(g: &G) -> Vec { + match g.layer_ids() { + LayerIds::None => vec![], + LayerIds::All => g.edge_meta().layer_meta().ids().map(LayerId).collect(), + LayerIds::One(id) => vec![*id], + LayerIds::Multiple(ids) => ids.iter().collect(), + } +} + +pub(crate) fn encode_edge_tprop( + g: &G, + sink_factory_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, +) -> Result<(), GraphError> { + let graph_locked = g.core_graph().lock(); + let edges_locked = graph_locked.edges(); + // if we go layer by layer, we save a lot of disk space for 
graphs saved on disk + for layer_id in active_layers(g) { + let layer_filter = LayerIds::One(layer_id); + run_encode_indexed( + g, + g.edge_meta().temporal_prop_mapper(), + get_edges_par_iter(g, &edges_locked, &layer_filter), + |schema, chunk, num_digits| sink_factory_fn(schema, chunk, num_digits), + |id_type| { + vec![ + Field::new(TIME_COL, DataType::Int64, false), + Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), + Field::new(SRC_COL_VID, DataType::UInt64, false), + Field::new(SRC_COL_ID, id_type.clone(), false), + Field::new(DST_COL_VID, DataType::UInt64, false), + Field::new(DST_COL_ID, id_type.clone(), false), + Field::new(EDGE_COL_ID, DataType::UInt64, false), + Field::new(LAYER_COL, DataType::Utf8, true), + Field::new(LAYER_ID_COL, DataType::UInt64, true), + ] + }, + |edges, _g, decoder, sink| { + for edge_rows in edges + .into_iter() + .flat_map(|e| e.explode()) + .filter(|edge| edge.edge.layer() == Some(layer_id)) + .map(|edge| { + let (export_src_id, export_dst_id) = edge.id(); + ParquetTEdge { + edge, + export_src_vid: edge.src().node.0, + export_src_id, + export_dst_vid: edge.dst().node.0, + export_dst_id, + export_eid: edge.edge.pid(), + export_layer_id: edge.edge.layer().map(|l| l.0), + } + }) + .chunks(ROW_GROUP_SIZE) + .into_iter() + .map(|chunk| chunk.collect_vec()) + { + decoder.serialize(&edge_rows)?; + if let Some(rb) = decoder.flush()? 
{ + RecordBatchSink::send_batch(sink, rb)?; + } + } + Ok(()) + }, + )?; + } + Ok(()) +} + +pub(crate) fn encode_edge_deletions( + g: &G, + sink_factory_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, +) -> Result<(), GraphError> { + let graph_locked = g.core_graph().lock(); + let edges_locked = graph_locked.edges(); + for layer_id in active_layers(g) { + let layer_filter = LayerIds::One(layer_id); + run_encode_indexed( + g, + g.edge_meta().temporal_prop_mapper(), + get_edges_par_iter(g, &edges_locked, &layer_filter), + |schema, chunk, num_digits| sink_factory_fn(schema, chunk, num_digits), + |id_type| { + vec![ + Field::new(TIME_COL, DataType::Int64, false), + Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), + Field::new(SRC_COL_VID, DataType::UInt64, false), + Field::new(SRC_COL_ID, id_type.clone(), false), + Field::new(DST_COL_VID, DataType::UInt64, false), + Field::new(DST_COL_ID, id_type.clone(), false), + Field::new(EDGE_COL_ID, DataType::UInt64, false), + Field::new(LAYER_COL, DataType::Utf8, true), + Field::new(LAYER_ID_COL, DataType::UInt64, true), + ] + }, + |edges, _g, decoder, sink| { + for edge_rows in edges + .into_iter() + .flat_map(|e| e.explode_layers()) + .filter(|edge| edge.edge.layer() == Some(layer_id)) + .flat_map(|edge| { + edge.deletions().into_iter().map(move |deletion| { + let (export_src_id, export_dst_id) = edge.id(); + ParquetDelEdge { + edge, + del: deletion, + export_src_vid: edge.src().node.0, + export_src_id, + export_dst_vid: edge.dst().node.0, + export_dst_id, + export_eid: edge.edge.pid().0, + export_layer_id: edge.edge.layer().map(|l| l.0), + } + }) + }) + .chunks(ROW_GROUP_SIZE) + .into_iter() + .map(|chunk| chunk.collect_vec()) + { + decoder.serialize(&edge_rows)?; + if let Some(rb) = decoder.flush()? 
{ + RecordBatchSink::send_batch(sink, rb)?; + } + } + Ok(()) + }, + )?; + } + Ok(()) +} + +pub(crate) fn encode_edge_cprop( + g: &G, + sink_factory_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, +) -> Result<(), GraphError> { + let graph_locked = g.core_graph().lock(); + let edges_locked = graph_locked.edges(); + for layer_id in active_layers(g) { + let layer_filter = LayerIds::One(layer_id); + run_encode_indexed( + g, + g.edge_meta().metadata_mapper(), + get_edges_par_iter(g, &edges_locked, &layer_filter), + |schema, chunk, num_digits| sink_factory_fn(schema, chunk, num_digits), + |id_type| { + vec![ + Field::new(SRC_COL_VID, DataType::UInt64, false), + Field::new(SRC_COL_ID, id_type.clone(), false), + Field::new(DST_COL_VID, DataType::UInt64, false), + Field::new(DST_COL_ID, id_type.clone(), false), + Field::new(EDGE_COL_ID, DataType::UInt64, false), + Field::new(LAYER_COL, DataType::Utf8, true), + ] + }, + |edges, _g, decoder, sink| { + for edge_rows in edges + .into_iter() + .flat_map(|e| e.explode_layers().into_iter()) + .filter(|edge| edge.edge.layer() == Some(layer_id)) + .map(|edge| { + let (export_src_id, export_dst_id) = edge.id(); + ParquetCEdge { + edge, + export_src_vid: edge.src().node.0, + export_src_id, + export_dst_vid: edge.dst().node.0, + export_dst_id, + export_eid: edge.edge.pid().0, + } + }) + .chunks(ROW_GROUP_SIZE) + .into_iter() + .map(|chunk| chunk.collect_vec()) + { + decoder.serialize(&edge_rows)?; + if let Some(rb) = decoder.flush()? 
{ + RecordBatchSink::send_batch(sink, rb)?; + } + } + Ok(()) + }, + )?; + } + Ok(()) +} diff --git a/raphtory/src/serialise/parquet/graph.rs b/raphtory/src/parquet_encoder/graph.rs similarity index 68% rename from raphtory/src/serialise/parquet/graph.rs rename to raphtory/src/parquet_encoder/graph.rs index e5349bdd30..f448072fc4 100644 --- a/raphtory/src/serialise/parquet/graph.rs +++ b/raphtory/src/parquet_encoder/graph.rs @@ -1,37 +1,32 @@ use crate::{ + db::api::state::ops::GraphView, errors::GraphError, + parquet_encoder::{run_encode, RecordBatchSink, SECONDARY_INDEX_COL, TIME_COL}, prelude::{GraphViewOps, Prop, PropertiesOps}, - serialise::parquet::{ - run_encode, EVENT_GRAPH_TYPE, GRAPH_C_PATH, GRAPH_TYPE, GRAPH_T_PATH, - PERSISTENT_GRAPH_TYPE, SECONDARY_INDEX_COL, TIME_COL, - }, }; -use arrow::datatypes::{DataType, Field}; +use arrow::datatypes::{DataType, Field, SchemaRef}; use itertools::Itertools; -use parquet::file::metadata::KeyValue; -use raphtory_api::{ - core::{entities::properties::prop::SerdeArrowProp, storage::arc_str::ArcStr}, - GraphType, -}; +use raphtory_api::core::{entities::properties::prop::SerdeArrowProp, storage::arc_str::ArcStr}; use raphtory_core::storage::timeindex::EventTime; -use raphtory_storage::graph::graph::GraphStorage; use serde::{ser::SerializeMap, Serialize}; -use std::{collections::HashMap, path::Path}; +use std::collections::HashMap; -pub fn encode_graph_tprop(g: &GraphStorage, path: impl AsRef) -> Result<(), GraphError> { +pub fn encode_graph_tprop( + g: &G, + sink_factory_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, +) -> Result<(), GraphError> { run_encode( g, g.graph_props_meta().temporal_prop_mapper(), 1, - path, - GRAPH_T_PATH, + sink_factory_fn, |_| { vec![ Field::new(TIME_COL, DataType::Int64, false), Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), ] }, - |_, g, decoder, writer| { + |_, g, decoder, sink| { // Collect into owned props here to avoid lifetime issues on prop_view. 
// Ideally we want to be returning refs to the props but this // is not possible with the current API. @@ -65,8 +60,7 @@ pub fn encode_graph_tprop(g: &GraphStorage, path: impl AsRef) -> Result<() decoder.serialize(&rows)?; if let Some(rb) = decoder.flush()? { - writer.write(&rb)?; - writer.flush()?; + RecordBatchSink::send_batch(sink, rb)?; } Ok(()) @@ -98,19 +92,17 @@ impl Serialize for Row { } } -pub fn encode_graph_cprop( - g: &GraphStorage, - graph_type: GraphType, - path: impl AsRef, +pub fn encode_graph_cprop( + g: &G, + sink_factory_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, ) -> Result<(), GraphError> { run_encode( g, g.graph_props_meta().metadata_mapper(), 1, - path, - GRAPH_C_PATH, + sink_factory_fn, |_| vec![Field::new(TIME_COL, DataType::Int64, true)], - |_, g, decoder, writer| { + |_, g, decoder, sink| { let row = g.metadata().as_map(); let time = EventTime::new(0, 0); // const props don't have time let rows = vec![Row { t: time, row }]; @@ -118,21 +110,9 @@ pub fn encode_graph_cprop( decoder.serialize(&rows)?; if let Some(rb) = decoder.flush()? 
{ - writer.write(&rb)?; - writer.flush()?; + RecordBatchSink::send_batch(sink, rb)?; } - match graph_type { - GraphType::EventGraph => writer.append_key_value_metadata(KeyValue::new( - GRAPH_TYPE.to_string(), - Some(EVENT_GRAPH_TYPE.to_string()), - )), - GraphType::PersistentGraph => writer.append_key_value_metadata(KeyValue::new( - GRAPH_TYPE.to_string(), - Some(PERSISTENT_GRAPH_TYPE.to_string()), - )), - }; - Ok(()) }, ) diff --git a/raphtory/src/parquet_encoder/mod.rs b/raphtory/src/parquet_encoder/mod.rs new file mode 100644 index 0000000000..b70c6d30a0 --- /dev/null +++ b/raphtory/src/parquet_encoder/mod.rs @@ -0,0 +1,160 @@ +use crate::{ + db::api::state::ops::GraphView, errors::GraphError, parquet_encoder::model::get_id_type, + prelude::*, +}; +use arrow::{ + array::RecordBatch, + datatypes::{DataType, Field, Schema, SchemaRef}, +}; +use arrow_json::{reader::Decoder, ReaderBuilder}; +use itertools::Itertools; +use model::ParquetTEdge; +use raphtory_api::core::entities::{ + properties::{meta::PropMapper, prop::arrow_dtype_from_prop_type}, + GidType, +}; +use rayon::{prelude::*, ThreadPool, ThreadPoolBuilder}; +use std::{ops::Range, sync::LazyLock}; + +mod edges; +mod model; +mod nodes; + +mod graph; + +pub(crate) use edges::{encode_edge_cprop, encode_edge_deletions, encode_edge_tprop}; +pub(crate) use graph::{encode_graph_cprop, encode_graph_tprop}; +pub(crate) use nodes::{encode_nodes_cprop, encode_nodes_tprop}; + +const ROW_GROUP_SIZE: usize = 100_000; +pub(crate) const NODE_ID_COL: &str = "rap_node_id"; +pub(crate) const NODE_VID_COL: &str = "rap_node_vid"; +pub(crate) const TYPE_COL: &str = "rap_node_type"; +pub(crate) const TYPE_ID_COL: &str = "rap_node_type_id"; +pub(crate) const TIME_COL: &str = "rap_time"; +pub(crate) const SECONDARY_INDEX_COL: &str = "rap_secondary_index"; +pub(crate) const SRC_COL_VID: &str = "rap_src_vid"; +pub(crate) const DST_COL_VID: &str = "rap_dst_vid"; +pub(crate) const SRC_COL_ID: &str = "rap_src_id"; +pub(crate) const 
DST_COL_ID: &str = "rap_dst_id"; +pub(crate) const EDGE_COL_ID: &str = "rap_edge_id"; +pub(crate) const LAYER_COL: &str = "rap_layer"; +pub(crate) const LAYER_ID_COL: &str = "rap_layer_id"; + +pub trait RecordBatchSink { + fn send_batch(&mut self, batch: RecordBatch) -> Result<(), GraphError>; + fn finish(self) -> Result<(), GraphError> + where + Self: Sized; +} + +pub(crate) fn run_encode( + g: &G, + meta: &PropMapper, + size: usize, + make_sink_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, + default_fields_fn: impl Fn(&DataType) -> Vec, + encode_fn: impl Fn(Range, &G, &mut Decoder, &mut S) -> Result<(), GraphError> + Sync, +) -> Result<(), GraphError> { + let schema = derive_schema(meta, g.id_type(), default_fields_fn)?; + + if size > 0 { + let chunk_size = (size / rayon::current_num_threads()).max(128); + let iter = (0..size).into_par_iter().step_by(chunk_size); + + let num_digits = iter.len().to_string().len(); + + iter.enumerate().try_for_each(|(chunk, first)| { + let items = first..(first + chunk_size).min(size); + + let mut sink = make_sink_fn(schema.clone(), chunk, num_digits)?; + let mut decoder = ReaderBuilder::new(schema.clone()).build_decoder()?; + + encode_fn(items, g, &mut decoder, &mut sink)?; + + sink.finish()?; + Ok::<_, GraphError>(()) + })?; + } + Ok(()) +} + +pub(crate) fn run_encode_indexed< + Index, + II: Iterator, + G: GraphView, + S: RecordBatchSink, +>( + g: &G, + meta: &PropMapper, + items: impl ParallelIterator, + make_sink_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, + default_fields_fn: impl Fn(&DataType) -> Vec, + encode_fn: impl Fn(II, &G, &mut Decoder, &mut S) -> Result<(), GraphError> + Sync, +) -> Result<(), GraphError> { + let schema = derive_schema(meta, g.id_type(), default_fields_fn)?; + let num_digits = 8; + + items.try_for_each(|(chunk, items)| { + let mut sink = make_sink_fn(schema.clone(), chunk, num_digits)?; + let mut decoder = ReaderBuilder::new(schema.clone()).build_decoder()?; + + encode_fn(items, g, 
&mut decoder, &mut sink)?; + + sink.finish()?; + Ok::<_, GraphError>(()) + })?; + + Ok(()) +} + +pub(crate) fn derive_schema( + prop_meta: &PropMapper, + id_type: Option, + default_fields_fn: impl Fn(&DataType) -> Vec, +) -> Result { + let fields = arrow_fields(prop_meta); + let id_type = get_id_type(id_type); + + let make_schema = |id_type: DataType, prop_columns: Vec| { + let default_fields = default_fields_fn(&id_type); + + Schema::new( + default_fields + .into_iter() + .chain(prop_columns) + .collect::>(), + ) + .into() + }; + + let schema = if let Ok(id_type) = id_type { + make_schema(id_type, fields) + } else { + make_schema(DataType::UInt64, fields) + }; + + Ok(schema) +} + +fn arrow_fields(meta: &PropMapper) -> Vec { + meta.keys() + .iter() + .zip(meta.ids()) + .filter_map(|(name, prop_id)| { + meta.get_dtype(prop_id) + .map(move |prop_type| (name, prop_type)) + }) + .map(|(name, prop_type)| { + let d_type = arrow_dtype_from_prop_type(&prop_type); + Field::new(name, d_type, true) + }) + .collect() +} + +pub(crate) static ENCODE_POOL: LazyLock = LazyLock::new(|| { + ThreadPoolBuilder::new() + .thread_name(|idx| format!("PS Encode Thread-{idx}")) + .build() + .unwrap() +}); diff --git a/raphtory/src/parquet_encoder/model.rs b/raphtory/src/parquet_encoder/model.rs new file mode 100644 index 0000000000..6c348d2bf5 --- /dev/null +++ b/raphtory/src/parquet_encoder/model.rs @@ -0,0 +1,256 @@ +use super::{ + Prop, DST_COL_ID, LAYER_COL, NODE_ID_COL, SECONDARY_INDEX_COL, SRC_COL_ID, TIME_COL, TYPE_COL, +}; +use crate::{ + db::{ + api::view::internal::GraphView, + graph::{edge::EdgeView, node::NodeView}, + }, + parquet_encoder::{ + DST_COL_VID, EDGE_COL_ID, LAYER_ID_COL, NODE_VID_COL, SRC_COL_VID, TYPE_ID_COL, + }, + prelude::*, +}; +use arrow::datatypes::DataType; +use raphtory_api::core::{ + entities::{ + properties::{prop::SerdeArrowProp, tprop::TPropOps}, + GidType, LayerId, EID, + }, + storage::{ + arc_str::ArcStr, + timeindex::{EventTime, TimeIndexOps}, + }, 
+}; +use raphtory_storage::graph::edges::edge_storage_ops::EdgeStorageOps; +use serde::{ + ser::{Error, SerializeMap}, + Serialize, +}; + +#[derive(Debug)] +struct ParquetGID<'a>(&'a GID); + +impl Serialize for ParquetGID<'_> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + match self.0 { + GID::U64(id) => serializer.serialize_u64(*id), + GID::Str(id) => serializer.serialize_str(id), + } + } +} + +#[derive(Debug)] +pub(crate) struct ParquetTEdge<'a, G: GraphView> { + pub(crate) edge: EdgeView<&'a G>, + pub(crate) export_src_vid: usize, + pub(crate) export_src_id: GID, + pub(crate) export_dst_vid: usize, + pub(crate) export_dst_id: GID, + pub(crate) export_eid: EID, + pub(crate) export_layer_id: Option, +} + +impl<'a, G: GraphView> Serialize for ParquetTEdge<'a, G> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let edge = &self.edge; + let mut state = serializer.serialize_map(None)?; + let t = edge + .edge + .time() + .ok_or(S::Error::custom("Edge has no time"))?; + let layer = edge + .layer_name() + .map_err(|_| S::Error::custom("Edge has no layer"))?; + + let layer_id = self + .export_layer_id + .ok_or_else(|| S::Error::custom("Edge has no layer"))?; + + state.serialize_entry(TIME_COL, &t.0)?; + state.serialize_entry(SECONDARY_INDEX_COL, &t.1)?; + state.serialize_entry(SRC_COL_VID, &self.export_src_vid)?; + state.serialize_entry(SRC_COL_ID, &ParquetGID(&self.export_src_id))?; + state.serialize_entry(DST_COL_VID, &self.export_dst_vid)?; + state.serialize_entry(DST_COL_ID, &ParquetGID(&self.export_dst_id))?; + state.serialize_entry(EDGE_COL_ID, &self.export_eid)?; + state.serialize_entry(LAYER_COL, &layer)?; + state.serialize_entry(LAYER_ID_COL, &layer_id)?; + + // Emit temporal props for this exploded event. 
Two cases: + // * real addition at exactly `t` in storage — use `at(t)` so we only + // emit props actually set at that time (additions without new prop + // values should not inherit persisted values, otherwise later + // reloads see spurious duplicates). + // * synthetic event (windowed persistent graph emits an event at + // `w.start` for edges that persist into the window) — fall back to + // the view's persistent semantics via `latest` to emit the + // persisted prop values. + let core = edge.graph.core_edge(edge.edge.pid()); + let core_ref = core.as_ref(); + let layer = LayerId(layer_id); + let t_next = EventTime::start(t.0.saturating_add(1)); + let has_real_addition = core_ref.additions(layer).active(t..t_next); + let mapper = edge.graph.edge_meta().temporal_prop_mapper(); + let temporal = edge.properties().temporal(); + for prop_id in mapper.ids() { + let prop = if has_real_addition { + core_ref.temporal_prop_layer(layer, prop_id).at(&t) + } else { + temporal.get_by_id(prop_id).and_then(|v| v.latest()) + }; + if let Some(prop) = prop { + let name = mapper.get_name(prop_id); + state.serialize_entry(&name, &SerdeArrowProp(&prop))?; + } + } + + state.end() + } +} + +#[derive(Debug)] +pub(crate) struct ParquetCEdge<'a, G: GraphView> { + pub(crate) edge: EdgeView<&'a G>, + pub(crate) export_src_vid: usize, + pub(crate) export_src_id: GID, + pub(crate) export_dst_vid: usize, + pub(crate) export_dst_id: GID, + pub(crate) export_eid: usize, +} + +impl<'a, G: GraphView> Serialize for ParquetCEdge<'a, G> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let edge = &self.edge; + let mut state = serializer.serialize_map(None)?; + let layer = edge + .layer_name() + .map_err(|_| S::Error::custom("Edge has no layer"))?; + + state.serialize_entry(SRC_COL_VID, &self.export_src_vid)?; + state.serialize_entry(SRC_COL_ID, &ParquetGID(&self.export_src_id))?; + state.serialize_entry(DST_COL_VID, &self.export_dst_vid)?; + 
state.serialize_entry(DST_COL_ID, &ParquetGID(&self.export_dst_id))?; + state.serialize_entry(EDGE_COL_ID, &self.export_eid)?; + state.serialize_entry(LAYER_COL, &layer)?; + + for (name, prop) in edge.metadata().iter_filtered() { + state.serialize_entry(&name, &SerdeArrowProp(&prop))?; + } + + state.end() + } +} + +pub(crate) struct ParquetDelEdge<'a, G: GraphView> { + pub(crate) edge: EdgeView<&'a G>, + pub(crate) del: EventTime, + pub(crate) export_src_vid: usize, + pub(crate) export_src_id: GID, + pub(crate) export_dst_vid: usize, + pub(crate) export_dst_id: GID, + pub(crate) export_eid: usize, + pub(crate) export_layer_id: Option, +} + +impl<'a, G: GraphView> Serialize for ParquetDelEdge<'a, G> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut state = serializer.serialize_map(None)?; + + let layer_id = self + .export_layer_id + .ok_or_else(|| S::Error::custom("Edge has no layer"))?; + let layer = self + .edge + .layer_name() + .map_err(|_| S::Error::custom("Edge has no layer"))?; + + state.serialize_entry(TIME_COL, &self.del.0)?; + state.serialize_entry(SECONDARY_INDEX_COL, &self.del.1)?; + state.serialize_entry(SRC_COL_VID, &self.export_src_vid)?; + state.serialize_entry(SRC_COL_ID, &ParquetGID(&self.export_src_id))?; + state.serialize_entry(DST_COL_VID, &self.export_dst_vid)?; + state.serialize_entry(DST_COL_ID, &ParquetGID(&self.export_dst_id))?; + state.serialize_entry(EDGE_COL_ID, &self.export_eid)?; + state.serialize_entry(LAYER_COL, &layer)?; + state.serialize_entry(LAYER_ID_COL, &layer_id)?; + + state.end() + } +} + +pub(crate) struct ParquetTNode<'a> { + pub export_id: GID, + pub export_vid: usize, + pub export_node_type: Option, + pub cols: &'a [ArcStr], + pub t: EventTime, + pub props: Vec<(usize, Prop)>, +} + +impl<'a> Serialize for ParquetTNode<'a> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut state = serializer.serialize_map(None)?; + + 
state.serialize_entry(NODE_ID_COL, &ParquetGID(&self.export_id))?; + state.serialize_entry(NODE_VID_COL, &self.export_vid)?; + state.serialize_entry(TYPE_COL, &self.export_node_type)?; + state.serialize_entry(TIME_COL, &self.t.0)?; + state.serialize_entry(SECONDARY_INDEX_COL, &self.t.1)?; + + for (name, prop) in self.props.iter() { + state.serialize_entry(&self.cols[*name], &SerdeArrowProp(prop))?; + } + + state.end() + } +} + +pub(crate) struct ParquetCNode<'a, G: GraphView> { + pub node: NodeView<'a, &'a G>, + pub export_vid: usize, + pub export_node_type_id: usize, +} + +impl<'a, G: GraphView> Serialize for ParquetCNode<'a, G> { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + let mut state = serializer.serialize_map(None)?; + + state.serialize_entry(NODE_ID_COL, &ParquetGID(&self.node.id()))?; + state.serialize_entry(NODE_VID_COL, &self.export_vid)?; + state.serialize_entry(TYPE_COL, &self.node.node_type())?; + state.serialize_entry(TYPE_ID_COL, &self.export_node_type_id)?; + + for (name, prop) in self.node.metadata().iter_filtered() { + state.serialize_entry(&name, &SerdeArrowProp(&prop))?; + } + + state.end() + } +} + +pub(crate) fn get_id_type(id_type: Option) -> Result { + match id_type { + Some(GidType::Str) => Ok(DataType::Utf8), + Some(GidType::U64) => Ok(DataType::UInt64), + None => Err(DataType::UInt64), // Empty graph: no id type is known, so report Err carrying the UInt64 fallback that `derive_schema` uses
+ } +} diff --git a/raphtory/src/parquet_encoder/nodes.rs b/raphtory/src/parquet_encoder/nodes.rs new file mode 100644 index 0000000000..2c3309f59b --- /dev/null +++ b/raphtory/src/parquet_encoder/nodes.rs @@ -0,0 +1,140 @@ +use crate::{ + core::utils::iter::GenLockedIter, + db::{ + api::state::ops::{FilterOps, GraphView}, + graph::node::NodeView, + }, + errors::GraphError, + parquet_encoder::{ + model::{ParquetCNode, ParquetTNode}, + run_encode_indexed, RecordBatchSink, NODE_ID_COL, NODE_VID_COL, ROW_GROUP_SIZE, + SECONDARY_INDEX_COL, TIME_COL, TYPE_COL, TYPE_ID_COL, + }, + prelude::NodeViewOps, +}; +use arrow::datatypes::{DataType, Field, SchemaRef}; +use itertools::Itertools; +use raphtory_api::iter::IntoDynBoxed; +use raphtory_storage::graph::nodes::nodes_ref::NodesStorageEntry; +use rayon::iter::ParallelIterator; + +fn get_nodes_par_iter<'a, G: GraphView>( + g: &'a G, + nodes_locked: &'a NodesStorageEntry, +) -> impl ParallelIterator> + 'a)> { + let filtered = g.filtered(); + + nodes_locked + .row_groups_par_iter() + .map(move |(chunk, vids)| { + ( + chunk, + vids.filter_map(move |vid| { + let node = g.core_node(vid); + if !filtered || g.filter_node(node.as_ref()) { + Some(NodeView::new_internal(g, vid)) + } else { + None + } + }), + ) + }) +} + +pub(crate) fn encode_nodes_tprop( + g: &G, + sink_factory_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, +) -> Result<(), GraphError> { + let graph_locked = g.core_graph().lock(); + let nodes_locked = graph_locked.nodes(); + run_encode_indexed( + g, + g.node_meta().temporal_prop_mapper(), + get_nodes_par_iter(g, &nodes_locked), + sink_factory_fn, + |id_type| { + vec![ + Field::new(NODE_ID_COL, id_type.clone(), false), + Field::new(NODE_VID_COL, DataType::UInt64, false), + Field::new(TYPE_COL, DataType::Utf8, true), + Field::new(TIME_COL, DataType::Int64, false), + Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), + ] + }, + |nodes, g, decoder, sink| { + let nodes = nodes.collect::>(); + let nodes = 
nodes.into_iter(); + + let cols = g.node_meta().temporal_prop_mapper().all_keys(); + let cols = &cols; + for node_rows in nodes + .flat_map(move |node| { + GenLockedIter::from(node, |node| { + node.rows() + .map(|(t, _, props)| ParquetTNode { + export_id: node.id(), + export_vid: node.node.0, + export_node_type: node.node_type(), + cols, + t, + props, + }) + .into_dyn_boxed() + }) + }) + .chunks(ROW_GROUP_SIZE) + .into_iter() + .map(|chunk| chunk.collect_vec()) + { + decoder.serialize(&node_rows)?; + if let Some(rb) = decoder.flush()? { + RecordBatchSink::send_batch(sink, rb)?; + } + } + Ok(()) + }, + ) +} + +pub(crate) fn encode_nodes_cprop( + g: &G, + sink_factory_fn: impl Fn(SchemaRef, usize, usize) -> Result + Sync, +) -> Result<(), GraphError> { + let graph_locked = g.core_graph().lock(); + let nodes_locked = graph_locked.nodes(); + run_encode_indexed( + g, + g.node_meta().metadata_mapper(), + get_nodes_par_iter(g, &nodes_locked), + sink_factory_fn, + |id_type| { + vec![ + Field::new(NODE_ID_COL, id_type.clone(), false), + Field::new(NODE_VID_COL, DataType::UInt64, false), + Field::new(TYPE_COL, DataType::Utf8, true), + Field::new(TYPE_ID_COL, DataType::UInt64, true), + ] + }, + |nodes, _g, decoder, sink| { + for node_rows in nodes + .map(move |node| ParquetCNode { + node, + export_vid: node.node.0, + export_node_type_id: node.node_type_id(), + }) + .chunks(ROW_GROUP_SIZE) + .into_iter() + .map(|chunk| chunk.collect_vec()) + // scope for the decoder + { + decoder.serialize(&node_rows)?; + + if let Some(rb) = decoder.flush()? 
{ + RecordBatchSink::send_batch(sink, rb)?; + } + } + + Ok(()) + }, + ) +} diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 40d939d9be..f6616d2086 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -7,12 +7,13 @@ use crate::python::graph::index::PyIndexSpec; use crate::{ algorithms::components::LargestConnectedComponent, + arrow_loader::df_loaders::edges::ColumnNames, db::{ api::view::internal::{DynamicGraph, IntoDynamic, MaterializedGraph}, graph::{edge::EdgeView, node::NodeView, views::node_subgraph::NodeSubgraph}, }, errors::GraphError, - io::{arrow::df_loaders::edges::ColumnNames, parquet_loaders::*}, + io::parquet_loaders::*, prelude::*, python::{ config::PyConfig, diff --git a/raphtory/src/python/graph/graph_with_deletions.rs b/raphtory/src/python/graph/graph_with_deletions.rs index 353cea65e9..ec4b7dea4d 100644 --- a/raphtory/src/python/graph/graph_with_deletions.rs +++ b/raphtory/src/python/graph/graph_with_deletions.rs @@ -12,7 +12,7 @@ use crate::{ graph::{edge::EdgeView, node::NodeView, views::deletion_graph::PersistentGraph}, }, errors::GraphError, - io::{arrow::df_loaders::edges::ColumnNames, parquet_loaders::*}, + io::parquet_loaders::*, prelude::{DeletionOps, GraphViewOps, ImportOps, ParquetEncoder}, python::{ graph::{ @@ -49,7 +49,7 @@ use std::{ sync::Arc, }; -use crate::python::config::PyConfig; +use crate::{arrow_loader::df_loaders::edges::ColumnNames, python::config::PyConfig}; #[cfg(feature = "search")] use crate::{prelude::IndexMutationOps, python::graph::index::PyIndexSpec}; diff --git a/raphtory/src/python/graph/io/arrow_loaders.rs b/raphtory/src/python/graph/io/arrow_loaders.rs index b20c8d6290..d8b514ccfe 100644 --- a/raphtory/src/python/graph/io/arrow_loaders.rs +++ b/raphtory/src/python/graph/io/arrow_loaders.rs @@ -1,18 +1,16 @@ use crate::{ - db::api::view::StaticGraphViewOps, - errors::GraphError, - io::{ - arrow::{ - dataframe::{DFChunk, DFView}, - 
df_loaders::{ - edges::{load_edges_from_df_prefetch, ColumnNames}, - load_edge_deletions_from_df_prefetch, load_edges_props_from_df_prefetch, - load_graph_props_from_df, - nodes::{load_node_props_from_df, load_nodes_from_df}, - }, + arrow_loader::{ + dataframe::{DFChunk, DFView}, + df_loaders::{ + edges::{load_edges_from_df_prefetch, ColumnNames}, + load_edge_deletions_from_df_prefetch, load_edges_props_from_df_prefetch, + load_graph_props_from_df, + nodes::{load_node_props_from_df, load_nodes_from_df}, }, - parquet_loaders::cast_columns, }, + db::api::view::StaticGraphViewOps, + errors::GraphError, + io::parquet_loaders::cast_columns, prelude::{AdditionOps, PropertyAdditionOps}, }; use arrow::{ @@ -182,6 +180,7 @@ pub(crate) fn load_node_metadata_from_arrow_c_stream< metadata, shared_metadata, graph, + false, ) }) } @@ -590,6 +589,7 @@ pub(crate) fn load_node_metadata_from_csv_path< metadata, shared_metadata, graph, + false, ) } diff --git a/raphtory/src/serialise/parquet/mod.rs b/raphtory/src/serialise/parquet.rs similarity index 70% rename from raphtory/src/serialise/parquet/mod.rs rename to raphtory/src/serialise/parquet.rs index c0248d9aeb..ec9df2f2b4 100644 --- a/raphtory/src/serialise/parquet/mod.rs +++ b/raphtory/src/serialise/parquet.rs @@ -1,68 +1,46 @@ use crate::{ + arrow_loader::df_loaders::edges::ColumnNames, db::{ - api::{storage::storage::Storage, view::MaterializedGraph}, + api::{state::ops::GraphView, storage::storage::Storage, view::MaterializedGraph}, graph::views::deletion_graph::PersistentGraph, }, errors::GraphError, - io::{ - arrow::df_loaders::edges::ColumnNames, - parquet_loaders::{ - get_parquet_file_paths, load_edge_deletions_from_parquet, - load_edge_metadata_from_parquet, load_edges_from_parquet, - load_graph_props_from_parquet, load_node_metadata_from_parquet, - load_nodes_from_parquet, process_parquet_file_to_df, - }, + io::parquet_loaders::{ + get_parquet_file_paths, load_edge_deletions_from_parquet, load_edge_metadata_from_parquet, 
+ load_edges_from_parquet, load_graph_props_from_parquet, load_node_metadata_from_parquet, + load_nodes_from_parquet, process_parquet_file_to_df, }, - prelude::*, - serialise::{ - parquet::{ - edges::encode_edge_deletions, - graph::{encode_graph_cprop, encode_graph_tprop}, - model::get_id_type, - nodes::{encode_nodes_cprop, encode_nodes_tprop}, - }, - GraphPaths, + parquet_encoder::{ + encode_edge_cprop, encode_edge_deletions, encode_edge_tprop, encode_graph_cprop, + encode_graph_tprop, encode_nodes_cprop, encode_nodes_tprop, RecordBatchSink, DST_COL_ID, + DST_COL_VID, EDGE_COL_ID, LAYER_COL, LAYER_ID_COL, NODE_ID_COL, NODE_VID_COL, + SECONDARY_INDEX_COL, SRC_COL_ID, SRC_COL_VID, TIME_COL, TYPE_COL, TYPE_ID_COL, }, + prelude::*, + serialise::GraphPaths, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; -use arrow_json::{reader::Decoder, ReaderBuilder}; -use edges::{encode_edge_cprop, encode_edge_tprop}; +use arrow::{array::RecordBatch, datatypes::SchemaRef}; use itertools::Itertools; -use model::ParquetTEdge; use parquet::{ arrow::{arrow_reader::ArrowReaderMetadata, ArrowWriter}, basic::Compression, - file::properties::WriterProperties, -}; -use raphtory_api::{ - core::entities::{ - properties::{ - meta::PropMapper, - prop::{arrow_dtype_from_prop_type, prop_col::lift_property_col}, - }, - GidType, - }, - GraphType, + file::{metadata::KeyValue, properties::WriterProperties}, }; -use raphtory_storage::{core_ops::CoreGraphOps, graph::graph::GraphStorage}; -use rayon::prelude::*; +use raphtory_api::{core::entities::properties::prop::prop_col::lift_property_col, GraphType}; +use raphtory_storage::core_ops::CoreGraphOps; use std::{ fs::File, io::{Read, Seek, Write}, - ops::Range, path::{Path, PathBuf}, - sync::Arc, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, }; use storage::Config; use walkdir::WalkDir; use zip::{write::FileOptions, ZipArchive, ZipWriter}; -mod edges; -mod model; -mod nodes; - -mod graph; - pub trait ParquetEncoder { /// Encode 
the graph as parquet data to the zip writer /// (note the writer is still open for appending more data after calling this function) @@ -147,7 +125,7 @@ pub trait ParquetDecoder: Sized { if let Some(parent) = out_path.parent() { std::fs::create_dir_all(parent)?; } - let mut out_file = std::fs::File::create(&out_path)?; + let mut out_file = File::create(&out_path)?; std::io::copy(&mut file, &mut out_file)?; } } @@ -162,39 +140,17 @@ pub trait ParquetDecoder: Sized { ) -> Result; } -const NODE_ID_COL: &str = "rap_node_id"; -const NODE_VID_COL: &str = "rap_node_vid"; -const TYPE_COL: &str = "rap_node_type"; -const TYPE_ID_COL: &str = "rap_node_type_id"; -const TIME_COL: &str = "rap_time"; -const SECONDARY_INDEX_COL: &str = "rap_secondary_index"; -const SRC_COL_ID: &str = "rap_src_id"; -const DST_COL_ID: &str = "rap_dst_id"; -const EDGE_COL_ID: &str = "rap_edge_id"; -const LAYER_COL: &str = "rap_layer"; -const LAYER_ID_COL: &str = "rap_layer_id"; -const EDGES_T_PATH: &str = "edges_t"; -const EDGES_D_PATH: &str = "edges_d"; // deletions -const EDGES_C_PATH: &str = "edges_c"; -const NODES_T_PATH: &str = "nodes_t"; -const NODES_C_PATH: &str = "nodes_c"; -const GRAPH_T_PATH: &str = "graph_t"; -const GRAPH_C_PATH: &str = "graph_c"; -const GRAPH_TYPE: &str = "graph_type"; -const EVENT_GRAPH_TYPE: &str = "rap_event_graph"; -const PERSISTENT_GRAPH_TYPE: &str = "rap_persistent_graph"; - impl ParquetEncoder for Graph { fn encode_parquet(&self, path: impl AsRef) -> Result<(), GraphError> { let gs = self.core_graph().lock(); - encode_graph_storage(&gs, path, GraphType::EventGraph) + encode_graph_storage_to_parquet(&gs, path, GraphType::EventGraph) } } impl ParquetEncoder for PersistentGraph { fn encode_parquet(&self, path: impl AsRef) -> Result<(), GraphError> { let gs = self.core_graph().lock(); - encode_graph_storage(&gs, path, GraphType::PersistentGraph) + encode_graph_storage_to_parquet(&gs, path, GraphType::PersistentGraph) } } @@ -209,239 +165,184 @@ impl ParquetEncoder for 
MaterializedGraph { } } -fn encode_graph_storage( - g: &GraphStorage, - path: impl AsRef, - graph_type: GraphType, -) -> Result<(), GraphError> { - encode_edge_tprop(g, path.as_ref())?; - encode_edge_cprop(g, path.as_ref())?; - encode_edge_deletions(g, path.as_ref())?; - encode_nodes_tprop(g, path.as_ref())?; - encode_nodes_cprop(g, path.as_ref())?; - encode_graph_tprop(g, path.as_ref())?; - encode_graph_cprop(g, graph_type, path.as_ref())?; - Ok(()) -} - -pub(crate) fn run_encode( - g: &GraphStorage, - meta: &PropMapper, - size: usize, - path: impl AsRef, - suffix: &str, - default_fields_fn: impl Fn(&DataType) -> Vec, - encode_fn: impl Fn( - Range, - &GraphStorage, - &mut Decoder, - &mut ArrowWriter, - ) -> Result<(), GraphError> - + Sync, -) -> Result<(), GraphError> { - let schema = derive_schema(meta, g.id_type(), default_fields_fn)?; - let root_dir = path.as_ref().join(suffix); - std::fs::create_dir_all(&root_dir)?; - - if size > 0 { - let chunk_size = (size / rayon::current_num_threads()).max(128); - let iter = (0..size).into_par_iter().step_by(chunk_size); - - let num_digits = iter.len().to_string().len(); - - iter.enumerate().try_for_each(|(chunk, first)| { - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let items = first..(first + chunk_size).min(size); - - let node_file = File::create(root_dir.join(format!("{chunk:0num_digits$}.parquet")))?; - let mut writer = ArrowWriter::try_new(node_file, schema.clone(), Some(props))?; - - let mut decoder = ReaderBuilder::new(schema.clone()).build_decoder()?; - - encode_fn(items, g, &mut decoder, &mut writer)?; - - writer.close()?; - Ok::<_, GraphError>(()) - })?; +impl ParquetDecoder for Graph { + fn decode_parquet( + path: impl AsRef, + path_for_decoded_graph: Option<&Path>, + config: Config, + ) -> Result { + let batch_size = None; + let storage = decode_graph_storage(&path, batch_size, path_for_decoded_graph, config)?; + Ok(Graph::from_storage(storage)) } - Ok(()) } 
-pub(crate) fn run_encode_indexed>( - g: &GraphStorage, - meta: &PropMapper, - items: impl ParallelIterator, - path: impl AsRef, - suffix: &str, - default_fields_fn: impl Fn(&DataType) -> Vec, - encode_fn: impl Fn(II, &GraphStorage, &mut Decoder, &mut ArrowWriter) -> Result<(), GraphError> - + Sync, -) -> Result<(), GraphError> { - let schema = derive_schema(meta, g.id_type(), default_fields_fn)?; - let root_dir = path.as_ref().join(suffix); - std::fs::create_dir_all(&root_dir)?; - - let num_digits = 8; - - items.try_for_each(|(chunk, items)| { - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let node_file = File::create(root_dir.join(format!("{chunk:0num_digits$}.parquet")))?; - let mut writer = ArrowWriter::try_new(node_file, schema.clone(), Some(props))?; - - let mut decoder = ReaderBuilder::new(schema.clone()).build_decoder()?; - - encode_fn(items, g, &mut decoder, &mut writer)?; - - writer.close()?; - Ok::<_, GraphError>(()) - })?; - - Ok(()) -} - -pub(crate) fn derive_schema( - prop_meta: &PropMapper, - id_type: Option, - default_fields_fn: impl Fn(&DataType) -> Vec, -) -> Result { - let fields = arrow_fields(prop_meta); - let id_type = get_id_type(id_type); - - let make_schema = |id_type: DataType, prop_columns: Vec| { - let default_fields = default_fields_fn(&id_type); - - Schema::new( - default_fields - .into_iter() - .chain(prop_columns) - .collect::>(), - ) - .into() - }; - - let schema = if let Ok(id_type) = id_type { - make_schema(id_type, fields) - } else { - make_schema(DataType::UInt64, fields) - }; - - Ok(schema) -} - -fn arrow_fields(meta: &PropMapper) -> Vec { - meta.keys() - .iter() - .zip(meta.ids()) - .filter_map(|(name, prop_id)| { - meta.get_dtype(prop_id) - .map(move |prop_type| (name, prop_type)) - }) - .map(|(name, prop_type)| { - let d_type = arrow_dtype_from_prop_type(&prop_type); - Field::new(name, d_type, true) - }) - .collect() -} - -fn ls_parquet_files(dir: &Path) -> Result, 
GraphError> { - Ok(std::fs::read_dir(dir) - .inspect_err(|err| { - eprintln!("Error reading directory {}: {}", dir.display(), err); - })? // print out the path if it's missing - .filter_map(Result::ok) - .map(|entry| entry.path()) - .filter(|path| path.is_file() && path.extension().is_some_and(|ext| ext == "parquet"))) +impl ParquetDecoder for PersistentGraph { + fn decode_parquet( + path: impl AsRef, + path_for_decoded_graph: Option<&Path>, + config: Config, + ) -> Result { + let batch_size = None; + let storage = decode_graph_storage(&path, batch_size, path_for_decoded_graph, config)?; + Ok(PersistentGraph(storage)) + } } -fn collect_prop_columns( - path: &Path, - exclude: &[&str], -) -> Result<(Vec, Option), GraphError> { - let prop_columns_fn = - |path: &Path, exclude: &[&str]| -> Result<(Vec, Option), GraphError> { - let reader = ArrowReaderMetadata::load(&File::open(path)?, Default::default())?; - let cols = reader - .schema() - .fields() - .iter() - .map(|f| f.name().to_string()) - .filter(|f_name| !exclude.iter().any(|ex| ex == f_name)) - .collect_vec(); - let graph_type = reader - .metadata() - .file_metadata() - .key_value_metadata() - .and_then(|meta| { - meta.iter() - .find(|kv| kv.key == GRAPH_TYPE) - .and_then(|kv| kv.value.as_ref()) - .and_then(|v| match v.as_ref() { - EVENT_GRAPH_TYPE => Some(GraphType::EventGraph), - PERSISTENT_GRAPH_TYPE => Some(GraphType::PersistentGraph), - _ => None, - }) - }); - Ok((cols, graph_type)) - }; - - let mut prop_columns = vec![]; - let mut g_type: Option = None; - - // Collect columns from just the first file - if let Some(path) = ls_parquet_files(path)?.next() { - let (columns, tpe) = prop_columns_fn(&path, exclude)?; +impl ParquetDecoder for MaterializedGraph { + fn decode_parquet( + path: impl AsRef, + path_for_decoded_graph: Option<&Path>, + config: Config, + ) -> Result { + let batch_size = None; + let graph_type = decode_graph_type(&path)?; + let storage = decode_graph_storage(&path, batch_size, 
path_for_decoded_graph, config)?; - if g_type.is_none() { - g_type = tpe; + match graph_type { + GraphType::EventGraph => { + Ok(MaterializedGraph::EventGraph(Graph::from_storage(storage))) + } + GraphType::PersistentGraph => { + Ok(MaterializedGraph::PersistentGraph(PersistentGraph(storage))) + } } - - prop_columns.extend_from_slice(&columns); } - - Ok((prop_columns, g_type)) } -fn decode_graph_type(path: impl AsRef) -> Result { - let c_graph_path = path.as_ref().join(GRAPH_C_PATH); +const EDGES_T_PATH: &str = "edges_t"; +const EDGES_D_PATH: &str = "edges_d"; // deletions +const EDGES_C_PATH: &str = "edges_c"; +const NODES_T_PATH: &str = "nodes_t"; +const NODES_C_PATH: &str = "nodes_c"; +const GRAPH_T_PATH: &str = "graph_t"; +const GRAPH_C_PATH: &str = "graph_c"; +const GRAPH_TYPE: &str = "graph_type"; +const EVENT_GRAPH_TYPE: &str = "rap_event_graph"; +const PERSISTENT_GRAPH_TYPE: &str = "rap_persistent_graph"; - // Assume event graph as default - if !std::fs::exists(&c_graph_path)? { - return Ok(GraphType::EventGraph); +impl RecordBatchSink for ArrowWriter +where + W: Write + Seek + Send, +{ + fn send_batch(&mut self, batch: RecordBatch) -> Result<(), GraphError> { + ArrowWriter::write(self, &batch)?; + ArrowWriter::flush(self)?; + Ok(()) } - let exclude = vec![TIME_COL]; - let (_, g_type) = collect_prop_columns(&c_graph_path, &exclude)?; - - g_type.ok_or_else(|| GraphError::LoadFailure("Graph type not found".to_string())) + fn finish(self) -> Result<(), GraphError> { + self.close()?; + Ok(()) + } } -pub fn decode_graph_metadata( - path: &impl GraphPaths, -) -> Result)>, GraphError> { - let c_graph_path = path.graph_path()?.join(GRAPH_C_PATH); - let exclude = vec![TIME_COL]; - let (c_props, _) = collect_prop_columns(&c_graph_path, &exclude)?; - let c_props = c_props.iter().map(|s| s.as_str()).collect::>(); - let mut result: Vec<(String, Option)> = - c_props.iter().map(|s| (s.to_string(), None)).collect(); +fn create_arrow_writer_sink( + root_dir: &Path, + 
schema: SchemaRef, + file_id: usize, + filename_num_digits: usize, + key_value_metadata: Option>, +) -> Result, GraphError> { + std::fs::create_dir_all(&root_dir)?; - for path in get_parquet_file_paths(&c_graph_path)? { - let df_view = process_parquet_file_to_df(path.as_path(), Some(&c_props), None, None)?; - for chunk in df_view.chunks { - let chunk = chunk?; - for (col, res) in chunk.chunk.into_iter().zip(&mut result) { - if let Some(value) = lift_property_col(&col).get(0) { - res.1 = Some(value); - } - } - } - } - Ok(result) + let writer_properties = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_key_value_metadata(key_value_metadata) + .build(); + + let filename = format!("{file_id:0filename_num_digits$}.parquet"); + let node_file = File::create(root_dir.join(filename))?; + Ok(ArrowWriter::try_new( + node_file, + schema.clone(), + Some(writer_properties), + )?) +} + +fn encode_graph_storage_to_parquet( + g: &G, + path: impl AsRef, + graph_type: GraphType, +) -> Result<(), GraphError> { + let base_dir = path.as_ref(); + let edge_t_file_id = AtomicUsize::new(0); + let edge_c_file_id = AtomicUsize::new(0); + let edge_d_file_id = AtomicUsize::new(0); + + encode_edge_tprop(g, |schema, _chunk, num_digits| { + create_arrow_writer_sink( + &base_dir.join(EDGES_T_PATH), + schema.clone(), + edge_t_file_id.fetch_add(1, Ordering::Relaxed), + num_digits, + None, + ) + })?; + encode_edge_cprop(g, |schema, _chunk, num_digits| { + create_arrow_writer_sink( + &base_dir.join(EDGES_C_PATH), + schema.clone(), + edge_c_file_id.fetch_add(1, Ordering::Relaxed), + num_digits, + None, + ) + })?; + encode_edge_deletions(g, |schema, _chunk, num_digits| { + create_arrow_writer_sink( + &base_dir.join(EDGES_D_PATH), + schema.clone(), + edge_d_file_id.fetch_add(1, Ordering::Relaxed), + num_digits, + None, + ) + })?; + encode_nodes_tprop(g, |schema, chunk, num_digits| { + create_arrow_writer_sink( + &base_dir.join(NODES_T_PATH), + schema.clone(), + chunk, + 
num_digits, + None, + ) + })?; + encode_nodes_cprop(g, |schema, chunk, num_digits| { + create_arrow_writer_sink( + &base_dir.join(NODES_C_PATH), + schema.clone(), + chunk, + num_digits, + None, + ) + })?; + encode_graph_tprop(g, |schema, chunk, num_digits| { + create_arrow_writer_sink( + &base_dir.join(GRAPH_T_PATH), + schema.clone(), + chunk, + num_digits, + None, + ) + })?; + encode_graph_cprop(g, |schema, chunk, num_digits| { + let graph_type_str = match graph_type { + GraphType::EventGraph => EVENT_GRAPH_TYPE, + GraphType::PersistentGraph => PERSISTENT_GRAPH_TYPE, + }; + let key_value_metadata = vec![KeyValue::new( + GRAPH_TYPE.to_string(), + Some(graph_type_str.to_string()), + )]; + + create_arrow_writer_sink( + &base_dir.join(GRAPH_C_PATH), + schema.clone(), + chunk, + num_digits, + Some(key_value_metadata), + ) + })?; + Ok(()) } fn decode_graph_storage( @@ -522,7 +423,13 @@ fn decode_graph_storage( let t_node_path = path.as_ref().join(NODES_T_PATH); if std::fs::exists(&t_node_path)? 
{ - let exclude = vec![NODE_VID_COL, TIME_COL, SECONDARY_INDEX_COL]; + let exclude = vec![ + NODE_ID_COL, + NODE_VID_COL, + TYPE_COL, + TIME_COL, + SECONDARY_INDEX_COL, + ]; let (t_prop_columns, _) = collect_prop_columns(&t_node_path, &exclude)?; let t_prop_columns = t_prop_columns .iter() @@ -554,7 +461,9 @@ fn decode_graph_storage( let exclude = vec![ TIME_COL, SECONDARY_INDEX_COL, + SRC_COL_VID, SRC_COL_ID, + DST_COL_VID, DST_COL_ID, LAYER_COL, LAYER_ID_COL, @@ -572,8 +481,8 @@ fn decode_graph_storage( ColumnNames::new( TIME_COL, Some(SECONDARY_INDEX_COL), - SRC_COL_ID, - DST_COL_ID, + SRC_COL_VID, + DST_COL_VID, Some(LAYER_COL), ) .with_layer_id_col(LAYER_ID_COL) @@ -597,8 +506,8 @@ fn decode_graph_storage( ColumnNames::new( TIME_COL, Some(SECONDARY_INDEX_COL), - SRC_COL_ID, - DST_COL_ID, + SRC_COL_VID, + DST_COL_VID, Some(LAYER_COL), ) .with_layer_id_col(LAYER_ID_COL) @@ -613,7 +522,14 @@ fn decode_graph_storage( let c_edge_path = path.as_ref().join(EDGES_C_PATH); if std::fs::exists(&c_edge_path)? 
{ - let exclude = vec![SRC_COL_ID, DST_COL_ID, LAYER_COL, EDGE_COL_ID]; + let exclude = vec![ + SRC_COL_VID, + SRC_COL_ID, + DST_COL_VID, + DST_COL_ID, + LAYER_COL, + EDGE_COL_ID, + ]; let (c_prop_columns, _) = collect_prop_columns(&c_edge_path, &exclude)?; let metadata = c_prop_columns .iter() @@ -623,8 +539,8 @@ fn decode_graph_storage( load_edge_metadata_from_parquet( &graph, &c_edge_path, - SRC_COL_ID, - DST_COL_ID, + SRC_COL_VID, + DST_COL_VID, &metadata, None, None, @@ -637,47 +553,98 @@ fn decode_graph_storage( Ok(graph) } -impl ParquetDecoder for Graph { - fn decode_parquet( - path: impl AsRef, - path_for_decoded_graph: Option<&Path>, - config: Config, - ) -> Result { - let batch_size = None; - let storage = decode_graph_storage(&path, batch_size, path_for_decoded_graph, config)?; - Ok(Graph::from_storage(storage)) +pub fn decode_graph_metadata( + path: &impl GraphPaths, +) -> Result)>, GraphError> { + let c_graph_path = path.graph_path()?.join(GRAPH_C_PATH); + let exclude = vec![TIME_COL]; + let (c_props, _) = collect_prop_columns(&c_graph_path, &exclude)?; + let c_props = c_props.iter().map(|s| s.as_str()).collect::>(); + let mut result: Vec<(String, Option)> = + c_props.iter().map(|s| (s.to_string(), None)).collect(); + + for path in get_parquet_file_paths(&c_graph_path)? 
{ + let df_view = process_parquet_file_to_df(path.as_path(), Some(&c_props), None, None)?; + for chunk in df_view.chunks { + let chunk = chunk?; + for (col, res) in chunk.chunk.into_iter().zip(&mut result) { + if let Some(value) = lift_property_col(&col).get(0) { + res.1 = Some(value); + } + } + } } + Ok(result) } -impl ParquetDecoder for PersistentGraph { - fn decode_parquet( - path: impl AsRef, - path_for_decoded_graph: Option<&Path>, - config: Config, - ) -> Result { - let batch_size = None; - let storage = decode_graph_storage(&path, batch_size, path_for_decoded_graph, config)?; - Ok(PersistentGraph(storage)) +fn decode_graph_type(path: impl AsRef) -> Result { + let c_graph_path = path.as_ref().join(GRAPH_C_PATH); + + // Assume event graph as default + if !std::fs::exists(&c_graph_path)? { + return Ok(GraphType::EventGraph); } + + let exclude = vec![TIME_COL]; + let (_, g_type) = collect_prop_columns(&c_graph_path, &exclude)?; + + g_type.ok_or_else(|| GraphError::LoadFailure("Graph type not found".to_string())) } -impl ParquetDecoder for MaterializedGraph { - fn decode_parquet( - path: impl AsRef, - path_for_decoded_graph: Option<&Path>, - config: Config, - ) -> Result { - let batch_size = None; - let graph_type = decode_graph_type(&path)?; - let storage = decode_graph_storage(&path, batch_size, path_for_decoded_graph, config)?; +fn collect_prop_columns( + path: &Path, + exclude: &[&str], +) -> Result<(Vec, Option), GraphError> { + let prop_columns_fn = + |path: &Path, exclude: &[&str]| -> Result<(Vec, Option), GraphError> { + let reader = ArrowReaderMetadata::load(&File::open(path)?, Default::default())?; + let cols = reader + .schema() + .fields() + .iter() + .map(|f| f.name().to_string()) + .filter(|f_name| !exclude.iter().any(|ex| ex == f_name)) + .collect_vec(); + let graph_type = reader + .metadata() + .file_metadata() + .key_value_metadata() + .and_then(|meta| { + meta.iter() + .find(|kv| kv.key == GRAPH_TYPE) + .and_then(|kv| kv.value.as_ref()) + 
.and_then(|v| match v.as_ref() { + EVENT_GRAPH_TYPE => Some(GraphType::EventGraph), + PERSISTENT_GRAPH_TYPE => Some(GraphType::PersistentGraph), + _ => None, + }) + }); + Ok((cols, graph_type)) + }; - match graph_type { - GraphType::EventGraph => { - Ok(MaterializedGraph::EventGraph(Graph::from_storage(storage))) - } - GraphType::PersistentGraph => { - Ok(MaterializedGraph::PersistentGraph(PersistentGraph(storage))) - } + let mut prop_columns = vec![]; + let mut g_type: Option = None; + + // Collect columns from just the first file + if let Some(path) = ls_parquet_files(path)?.next() { + let (columns, tpe) = prop_columns_fn(&path, exclude)?; + + if g_type.is_none() { + g_type = tpe; } + + prop_columns.extend_from_slice(&columns); } + + Ok((prop_columns, g_type)) +} + +fn ls_parquet_files(dir: &Path) -> Result, GraphError> { + Ok(std::fs::read_dir(dir) + .inspect_err(|err| { + eprintln!("Error reading directory {}: {}", dir.display(), err); + })? // print out the path if it's missing + .filter_map(Result::ok) + .map(|entry| entry.path()) + .filter(|path| path.is_file() && path.extension().is_some_and(|ext| ext == "parquet"))) } diff --git a/raphtory/src/serialise/parquet/edges.rs b/raphtory/src/serialise/parquet/edges.rs deleted file mode 100644 index dc58a8390f..0000000000 --- a/raphtory/src/serialise/parquet/edges.rs +++ /dev/null @@ -1,178 +0,0 @@ -use super::*; -use crate::{ - core::utils::iter::GenLockedIter, db::graph::edge::EdgeView, errors::GraphError, - serialise::parquet::model::ParquetDelEdge, -}; -use arrow::datatypes::{DataType, Field}; -use model::ParquetCEdge; -use raphtory_api::{core::storage::timeindex::TimeIndexOps, iter::IntoDynBoxed}; -use raphtory_storage::{ - core_ops::CoreGraphOps, - graph::{edges::edge_storage_ops::EdgeStorageOps, graph::GraphStorage}, -}; -use std::path::Path; - -pub(crate) fn encode_edge_tprop( - g: &GraphStorage, - path: impl AsRef, -) -> Result<(), GraphError> { - run_encode_indexed( - g, - 
g.edge_meta().temporal_prop_mapper(), - g.edges().segmented_par_iter().unwrap_or_else(|| { - panic!("Internal Error: segmented_par_iter cannot be called from unlocked GraphStorage") - }), - path, - EDGES_T_PATH, - |_| { - vec![ - Field::new(TIME_COL, DataType::Int64, false), - Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), - Field::new(SRC_COL_ID, DataType::UInt64, false), - Field::new(DST_COL_ID, DataType::UInt64, false), - Field::new(EDGE_COL_ID, DataType::UInt64, false), - Field::new(LAYER_COL, DataType::Utf8, true), - Field::new(LAYER_ID_COL, DataType::UInt64, true), - ] - }, - |edges, g, decoder, writer| { - let row_group_size = 100_000; - - for edge_rows in edges - .into_iter() - .flat_map(|eid| { - let edge_ref = g.core_edge(eid).out_ref(); - EdgeView::new(g, edge_ref).explode() - }) - .map(ParquetTEdge) - .chunks(row_group_size) - .into_iter() - .map(|chunk| chunk.collect_vec()) - { - decoder.serialize(&edge_rows)?; - if let Some(rb) = decoder.flush()? { - writer.write(&rb)?; - writer.flush()?; - } - } - Ok(()) - }, - ) -} - -pub(crate) fn encode_edge_deletions( - g: &GraphStorage, - path: impl AsRef, -) -> Result<(), GraphError> { - run_encode_indexed( - g, - g.edge_meta().temporal_prop_mapper(), - g.edges().segmented_par_iter().unwrap_or_else(|| { - panic!("Internal Error: segmented_par_iter cannot be called from unlocked GraphStorage") - }), - path, - EDGES_D_PATH, - |_| { - vec![ - Field::new(TIME_COL, DataType::Int64, false), - Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), - Field::new(SRC_COL_ID, DataType::UInt64, false), - Field::new(DST_COL_ID, DataType::UInt64, false), - Field::new(EDGE_COL_ID, DataType::UInt64, false), - Field::new(LAYER_COL, DataType::Utf8, true), - Field::new(LAYER_ID_COL, DataType::UInt64, true), - ] - }, - |edges, g, decoder, writer| { - let row_group_size = 100_000; - let g = g.lock(); - let g = &g; - let g_edges = g.edges(); - let layers = g - .unique_layers() - .map(|s| s.to_string().to_owned()) - 
.collect::>(); - let layers = &layers; - - for edge_rows in edges - .into_iter() - .flat_map(|eid| { - g.unfiltered_layer_ids().flat_map(move |layer_id| { - let edge = g_edges.edge(eid); - let edge_ref = edge.out_ref(); - GenLockedIter::from(edge, |edge| { - edge.deletions(layer_id).iter().into_dyn_boxed() - }) - .map(move |deletions| ParquetDelEdge { - del: deletions, - layer: &layers[layer_id.0 - 1], - layer_id, - edge: EdgeView::new(g, edge_ref), - }) - }) - }) - .chunks(row_group_size) - .into_iter() - .map(|chunk| chunk.collect_vec()) - { - decoder.serialize(&edge_rows)?; - if let Some(rb) = decoder.flush()? { - writer.write(&rb)?; - writer.flush()?; - } - } - Ok(()) - }, - ) -} - -pub(crate) fn encode_edge_cprop( - g: &GraphStorage, - path: impl AsRef, -) -> Result<(), GraphError> { - run_encode_indexed( - g, - g.edge_meta().metadata_mapper(), - g.edges().segmented_par_iter().unwrap_or_else(|| { - panic!("Internal Error: segmented_par_iter cannot be called from unlocked GraphStorage") - }), - path, - EDGES_C_PATH, - |_| { - vec![ - Field::new(SRC_COL_ID, DataType::UInt64, false), - Field::new(DST_COL_ID, DataType::UInt64, false), - Field::new(EDGE_COL_ID, DataType::UInt64, false), - Field::new(LAYER_COL, DataType::Utf8, true), - ] - }, - |edges, g, decoder, writer| { - let row_group_size = 100_000; - - for edge_rows in edges - .into_iter() - .flat_map(|eid| { - let edge_ref = g.core_edge(eid).out_ref(); - let edges_in_layers = EdgeView::new(g, edge_ref) - .explode_layers() - .into_iter() - .map(|e| e.edge) - .collect_vec(); - - edges_in_layers.into_iter() - }) - .map(|edge| ParquetCEdge(EdgeView::new(g, edge))) - .chunks(row_group_size) - .into_iter() - .map(|chunk| chunk.collect_vec()) - { - decoder.serialize(&edge_rows)?; - if let Some(rb) = decoder.flush()? 
{ - writer.write(&rb)?; - writer.flush()?; - } - } - Ok(()) - }, - ) -} diff --git a/raphtory/src/serialise/parquet/model.rs b/raphtory/src/serialise/parquet/model.rs deleted file mode 100644 index e75d829334..0000000000 --- a/raphtory/src/serialise/parquet/model.rs +++ /dev/null @@ -1,187 +0,0 @@ -use super::{Prop, LAYER_COL, NODE_ID_COL, SECONDARY_INDEX_COL, TIME_COL, TYPE_COL}; -use crate::{ - db::{ - api::view::StaticGraphViewOps, - graph::{edge::EdgeView, node::NodeView}, - }, - prelude::*, - serialise::parquet::{ - DST_COL_ID, EDGE_COL_ID, LAYER_ID_COL, NODE_VID_COL, SRC_COL_ID, TYPE_ID_COL, - }, -}; -use arrow::datatypes::DataType; -use raphtory_api::core::{ - entities::{properties::prop::SerdeArrowProp, GidType, LayerId}, - storage::{arc_str::ArcStr, timeindex::EventTime}, -}; -use raphtory_storage::graph::graph::GraphStorage; -use serde::{ - ser::{Error, SerializeMap}, - Serialize, -}; - -#[derive(Debug)] -struct ParquetGID(GID); - -impl Serialize for ParquetGID { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - match &self.0 { - GID::U64(id) => serializer.serialize_u64(*id), - GID::Str(id) => serializer.serialize_str(id), - } - } -} - -#[derive(Debug)] -pub(crate) struct ParquetTEdge<'a, G: StaticGraphViewOps>(pub(crate) EdgeView<&'a G>); - -impl<'a, G: StaticGraphViewOps> Serialize for ParquetTEdge<'a, G> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - let edge = &self.0; - let mut state = serializer.serialize_map(None)?; - let t = edge - .edge - .time() - .ok_or(S::Error::custom("Edge has no time"))?; - let layer = edge - .layer_name() - .map_err(|_| S::Error::custom("Edge has no layer"))?; - - let layer_id = edge - .edge - .layer() - .ok_or_else(|| S::Error::custom("Edge has no layer"))?; - - state.serialize_entry(TIME_COL, &t.0)?; - state.serialize_entry(SECONDARY_INDEX_COL, &t.1)?; - state.serialize_entry(SRC_COL_ID, &edge.src().node.0)?; - state.serialize_entry(DST_COL_ID, 
&edge.dst().node.0)?; - state.serialize_entry(EDGE_COL_ID, &edge.edge.pid())?; - state.serialize_entry(LAYER_COL, &layer)?; - state.serialize_entry(LAYER_ID_COL, &layer_id.0)?; - - for (name, prop) in edge.properties().temporal().iter_latest() { - state.serialize_entry(&name, &SerdeArrowProp(&prop))?; - } - - state.end() - } -} - -#[derive(Debug)] -pub(crate) struct ParquetCEdge<'a, G: StaticGraphViewOps>(pub(crate) EdgeView<&'a G>); - -impl<'a, G: StaticGraphViewOps> Serialize for ParquetCEdge<'a, G> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - let edge = &self.0; - let mut state = serializer.serialize_map(None)?; - let layer = edge - .layer_name() - .map_err(|_| S::Error::custom("Edge has no layer"))?; - - state.serialize_entry(SRC_COL_ID, &(edge.src().node.0))?; - state.serialize_entry(DST_COL_ID, &(edge.dst().node.0))?; - state.serialize_entry(EDGE_COL_ID, &(edge.edge.pid().0))?; - state.serialize_entry(LAYER_COL, &layer)?; - - for (name, prop) in edge.metadata().iter_filtered() { - state.serialize_entry(&name, &SerdeArrowProp(&prop))?; - } - - state.end() - } -} - -pub(crate) struct ParquetDelEdge<'a, G> { - pub layer: &'a str, - pub layer_id: LayerId, - pub edge: EdgeView<&'a G>, - pub del: EventTime, -} - -impl<'a, G: StaticGraphViewOps> Serialize for ParquetDelEdge<'a, G> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - let edge = &self.edge; - let mut state = serializer.serialize_map(None)?; - - state.serialize_entry(TIME_COL, &self.del.0)?; - state.serialize_entry(SECONDARY_INDEX_COL, &self.del.1)?; - state.serialize_entry(SRC_COL_ID, &(edge.src().node.0))?; - state.serialize_entry(DST_COL_ID, &(edge.dst().node.0))?; - state.serialize_entry(EDGE_COL_ID, &(edge.edge.pid().0))?; - state.serialize_entry(LAYER_COL, &self.layer)?; - state.serialize_entry(LAYER_ID_COL, &self.layer_id.0)?; - - state.end() - } -} - -pub(crate) struct ParquetTNode<'a> { - pub node: NodeView<'a, &'a 
GraphStorage>, - pub cols: &'a [ArcStr], - pub t: EventTime, - pub props: Vec<(usize, Prop)>, -} - -impl<'a> Serialize for ParquetTNode<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - let mut state = serializer.serialize_map(None)?; - - state.serialize_entry(NODE_VID_COL, &self.node.node.0)?; - state.serialize_entry(TIME_COL, &self.t.0)?; - state.serialize_entry(SECONDARY_INDEX_COL, &self.t.1)?; - - for (name, prop) in self.props.iter() { - state.serialize_entry(&self.cols[*name], &SerdeArrowProp(prop))?; - } - - state.end() - } -} - -pub(crate) struct ParquetCNode<'a> { - pub node: NodeView<'a, &'a GraphStorage>, -} - -impl<'a> Serialize for ParquetCNode<'a> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - let mut state = serializer.serialize_map(None)?; - - state.serialize_entry(NODE_ID_COL, &ParquetGID(self.node.id()))?; - state.serialize_entry(NODE_VID_COL, &self.node.node.0)?; - state.serialize_entry(TYPE_COL, &self.node.node_type())?; - state.serialize_entry(TYPE_ID_COL, &self.node.node_type_id())?; - - for (name, prop) in self.node.metadata().iter_filtered() { - state.serialize_entry(&name, &SerdeArrowProp(&prop))?; - } - - state.end() - } -} - -pub(crate) fn get_id_type(id_type: Option) -> Result { - match id_type { - Some(GidType::Str) => Ok(DataType::Utf8), - Some(GidType::U64) => Ok(DataType::UInt64), - None => Err(DataType::UInt64), // The graph is empty what now? 
- } -} diff --git a/raphtory/src/serialise/parquet/nodes.rs b/raphtory/src/serialise/parquet/nodes.rs deleted file mode 100644 index 0852d52645..0000000000 --- a/raphtory/src/serialise/parquet/nodes.rs +++ /dev/null @@ -1,111 +0,0 @@ -use crate::{ - core::utils::iter::GenLockedIter, - db::graph::node::NodeView, - errors::GraphError, - serialise::parquet::{ - model::{ParquetCNode, ParquetTNode}, - run_encode_indexed, NODES_C_PATH, NODES_T_PATH, NODE_ID_COL, NODE_VID_COL, - SECONDARY_INDEX_COL, TIME_COL, TYPE_COL, TYPE_ID_COL, - }, -}; -use arrow::datatypes::{DataType, Field}; -use itertools::Itertools; -use raphtory_api::iter::IntoDynBoxed; -use raphtory_storage::graph::graph::GraphStorage; -use std::path::Path; - -pub(crate) fn encode_nodes_tprop( - g: &GraphStorage, - path: impl AsRef, -) -> Result<(), GraphError> { - run_encode_indexed( - g, - g.node_meta().temporal_prop_mapper(), - g.nodes().row_groups_par_iter(), - path, - NODES_T_PATH, - |_| { - vec![ - Field::new(NODE_VID_COL, DataType::UInt64, false), - Field::new(TIME_COL, DataType::Int64, false), - Field::new(SECONDARY_INDEX_COL, DataType::UInt64, true), - ] - }, - |nodes, g, decoder, writer| { - let row_group_size = 100_000; - let nodes = nodes.collect::>(); - - let nodes = nodes.into_iter(); - - let cols = g.node_meta().temporal_prop_mapper().all_keys(); - let cols = &cols; - for node_rows in nodes - .map(|vid| NodeView::new_internal(g, vid)) - .flat_map(move |node| { - GenLockedIter::from(node, |node| { - node.rows() - .map(|(t, _, props)| ParquetTNode { - node: *node, - cols, - t, - props, - }) - .into_dyn_boxed() - }) - }) - .chunks(row_group_size) - .into_iter() - .map(|chunk| chunk.collect_vec()) - { - decoder.serialize(&node_rows)?; - if let Some(rb) = decoder.flush()? 
{ - writer.write(&rb)?; - writer.flush()?; - } - } - Ok(()) - }, - ) -} - -pub(crate) fn encode_nodes_cprop( - g: &GraphStorage, - path: impl AsRef, -) -> Result<(), GraphError> { - run_encode_indexed( - g, - g.node_meta().metadata_mapper(), - g.nodes().row_groups_par_iter(), - path, - NODES_C_PATH, - |id_type| { - vec![ - Field::new(NODE_ID_COL, id_type.clone(), false), - Field::new(NODE_VID_COL, DataType::UInt64, false), - Field::new(TYPE_COL, DataType::Utf8, true), - Field::new(TYPE_ID_COL, DataType::UInt64, true), - ] - }, - |nodes, g, decoder, writer| { - let row_group_size = 100_000; - - for node_rows in nodes - .map(|vid| NodeView::new_internal(g, vid)) - .map(move |node| ParquetCNode { node }) - .chunks(row_group_size) - .into_iter() - .map(|chunk| chunk.collect_vec()) - // scope for the decoder - { - decoder.serialize(&node_rows)?; - - if let Some(rb) = decoder.flush()? { - writer.write(&rb)?; - writer.flush()?; - } - } - - Ok(()) - }, - ) -} diff --git a/raphtory/tests/df_loaders.rs b/raphtory/tests/df_loaders.rs index 6f73e97758..970399c7c7 100644 --- a/raphtory/tests/df_loaders.rs +++ b/raphtory/tests/df_loaders.rs @@ -6,15 +6,15 @@ mod io_tests { use itertools::Itertools; use proptest::proptest; use raphtory::{ - db::graph::graph::assert_graph_equal, - errors::GraphError, - io::arrow::{ + arrow_loader::{ dataframe::{DFChunk, DFView}, df_loaders::{ edges::{load_edges_from_df_prefetch, ColumnNames}, nodes::{load_node_props_from_df, load_nodes_from_df}, }, }, + db::graph::graph::assert_graph_equal, + errors::GraphError, prelude::*, test_utils::{build_edge_list, build_edge_list_str, build_edge_list_with_secondary_index}, }; @@ -663,6 +663,7 @@ mod io_tests { &[], None, &g, + false, ) .unwrap(); diff --git a/raphtory/tests/test_materialize_sf10.rs b/raphtory/tests/test_materialize_sf10.rs new file mode 100644 index 0000000000..6b5df040a0 --- /dev/null +++ b/raphtory/tests/test_materialize_sf10.rs @@ -0,0 +1,752 @@ +use chrono::Local; +use 
parquet::arrow::arrow_reader::ArrowReaderMetadata; +use raphtory::{ + arrow_loader::df_loaders::edges::ColumnNames, + db::{ + api::view::{materialize_impl, MaterializedGraph}, + graph::graph::{assert_graph_equal_timestamps, graph_equal}, + }, + prelude::{AdditionOps, DeletionOps, Graph, GraphViewOps, LayerOps, PropertyAdditionOps}, +}; +#[cfg(feature = "io")] +use raphtory::{ + io::parquet_loaders::{ + get_parquet_file_paths, load_edge_deletions_from_parquet, load_edge_metadata_from_parquet, + load_edges_from_parquet, load_graph_props_from_parquet, load_node_metadata_from_parquet, + load_nodes_from_parquet, + }, + prelude::{ParquetDecoder, ParquetEncoder}, +}; +use raphtory_storage::core_ops::CoreGraphOps; +use std::{ + fs, io, + path::{Path, PathBuf}, + time::{Duration, Instant}, +}; +use storage::persist::strategy::PersistenceStrategy; + +#[cfg(feature = "io")] +fn default_sf10_graph_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../../ldbc/data/social_network-sf10-CsvComposite-LongDateFormatter/graph") +} + +#[cfg(feature = "io")] +fn default_sf10_parquet_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join( + "../../ldbc/data/social_network-sf10-CsvComposite-LongDateFormatter/parquet/data0/graph0", + ) +} + +#[cfg(feature = "io")] +fn default_sf1_graph_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../../ldbc/data/social_network-sf1-CsvComposite-LongDateFormatter/graph") +} + +#[cfg(feature = "io")] +fn default_sf1_parquet_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join( + "../../ldbc/data/social_network-sf1-CsvComposite-LongDateFormatter/parquet/data0/graph0", + ) +} + +fn default_materialized_graphs_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../ldbc/data/materialized-graphs") +} + +fn remove_dir_all_ignore_not_found(path: impl AsRef) -> io::Result<()> { + match fs::remove_dir_all(path.as_ref()) { + Ok(()) => Ok(()), + Err(err) if err.kind() 
== io::ErrorKind::NotFound => Ok(()), + Err(err) => Err(err), + } +} + +#[test] +fn test_materialize_using_recordbatches_matches_materialize() { + let g = Graph::new(); + g.add_node(0, "A", [("node_meta", "alpha")], Some("TypeA"), None) + .unwrap(); + g.add_node(1, "B", [("node_meta", "beta")], None, None) + .unwrap(); + g.add_edge(2, "A", "B", [("weight", 1)], Some("layer1")) + .unwrap(); + g.add_edge(3, "A", "B", [("weight", 2)], Some("layer2")) + .unwrap(); + g.delete_edge(4, "A", "B", Some("layer1")).unwrap(); + g.add_properties(5, [("graph_prop", "present")]).unwrap(); + g.add_metadata([("graph_meta", "constant")]).unwrap(); + + let expected = g.materialize().unwrap(); + let actual = materialize_impl(&g, None, g.core_graph().extension().config().clone()).unwrap(); + + assert_graph_equal_timestamps(&expected, &actual); +} + +#[cfg(feature = "io")] +#[test] +#[ignore = "requires a locally persisted SNB SF1 graph produced by ldbc/load_snb_sf10.py"] +fn test_materialize_snb_sf1_timings() { + let graph_path = default_sf1_graph_path(); + let old_materialize_graph_path = default_materialized_graphs_path().join("old_materialize"); + let rb_materialize_graph_path = default_materialized_graphs_path().join("rb_materialize"); + // clear out the directories in case they had previous files in them + remove_dir_all_ignore_not_found(&old_materialize_graph_path).unwrap(); + remove_dir_all_ignore_not_found(&rb_materialize_graph_path).unwrap(); + fs::create_dir_all(&old_materialize_graph_path).unwrap(); + fs::create_dir_all(&rb_materialize_graph_path).unwrap(); + + if !graph_path.exists() { + eprintln!("SNB graph not found at {}", graph_path.display()); + return; + } + + println!("Loading SNB graph from {}", graph_path.display()); + let g = Graph::load(&graph_path).unwrap(); + println!( + "Loaded source graph: {} nodes, {} edges, {} temporal edges", + g.count_nodes(), + g.count_edges(), + g.count_temporal_edges() + ); + + println!( + "Starting materialize using RecordBatches at 
{}", + Local::now() + ); + let recordbatch_start = Instant::now(); + let recordbatch_graph = materialize_impl( + &g, + Some(&rb_materialize_graph_path), + g.core_graph().extension().config().clone(), + ) + .unwrap(); + let recordbatch_elapsed = recordbatch_start.elapsed(); + println!( + "Finished materialize using RecordBatches at {}\nTook {recordbatch_elapsed:?}", + Local::now() + ); + + println!("Starting materialize impl (old) at {}", Local::now()); + let impl_start = Instant::now(); + let materialize_impl_graph = g.materialize_at(&old_materialize_graph_path).unwrap(); + let impl_elapsed = impl_start.elapsed(); + println!( + "Finished materialize impl (old) at {}\nTook {impl_elapsed:?}", + Local::now() + ); + + assert!(graph_equal(&g, &materialize_impl_graph)); + assert!(graph_equal(&g, &recordbatch_graph)); + + let impl_secs = impl_elapsed.as_secs_f64(); + let recordbatch_secs = recordbatch_elapsed.as_secs_f64(); + let faster = if impl_secs < recordbatch_secs { + "materialize_impl" + } else if recordbatch_secs < impl_secs { + "materialize_using_recordbatches" + } else { + "tie" + }; + let ratio = if impl_secs > 0.0 && recordbatch_secs > 0.0 { + if impl_secs > recordbatch_secs { + impl_secs / recordbatch_secs + } else { + recordbatch_secs / impl_secs + } + } else { + 1.0 + }; + + println!("Faster path: {faster} ({ratio:.2}x)"); +} + +#[cfg(feature = "io")] +#[test] +#[ignore = "requires a locally persisted SNB SF1 graph produced by ldbc/load_snb_sf10.py"] +fn test_materialize_filtered_sf1_matches() { + let graph_path = default_sf1_graph_path(); + let old_materialize_graph_path = + default_materialized_graphs_path().join("sf1_filtered_materialize_old"); + let rb_materialize_graph_path = + default_materialized_graphs_path().join("sf1_filtered_materialize_rb"); + + remove_dir_all_ignore_not_found(&old_materialize_graph_path).unwrap(); + remove_dir_all_ignore_not_found(&rb_materialize_graph_path).unwrap(); + fs::create_dir_all(&old_materialize_graph_path).unwrap(); 
+ fs::create_dir_all(&rb_materialize_graph_path).unwrap(); + + if !graph_path.exists() { + eprintln!("SNB graph not found at {}", graph_path.display()); + return; + } + + let selected_node_types = ["Person", "Forum", "Post", "Comment"]; + let selected_layers = [ + "KNOWS", + "LIKES", + "HAS_MEMBER", + "HAS_CREATOR", + "HAS_MODERATOR", + "CONTAINER_OF", + "REPLY_OF", + ]; + + println!( + "Loading filtered-view SF1 source graph from {}", + graph_path.display() + ); + let g = Graph::load(&graph_path).unwrap(); + + let total_nodes = g.count_nodes(); + let total_edges = g.count_edges(); + let total_temporal_edges = g.count_temporal_edges(); + + let filtered = g + .subgraph_node_types(selected_node_types) + .layers(selected_layers) + .unwrap(); + + let selected_nodes = filtered.count_nodes(); + let selected_edges = filtered.count_edges(); + let selected_temporal_edges = filtered.count_temporal_edges(); + + println!( + "Filtered SF1 view uses node types {:?} and layers {:?}", + selected_node_types, selected_layers + ); + let nodes_percent = (selected_nodes * 100).checked_div(total_nodes).unwrap_or(0); + let edges_percent = (selected_edges * 100).checked_div(total_edges).unwrap_or(0); + let temporal_edges_percent = (selected_temporal_edges * 100) + .checked_div(total_temporal_edges) + .unwrap_or(0); + + println!( + "Selected {selected_nodes}/{total_nodes} nodes ({nodes_percent}%), \ + {selected_edges}/{total_edges} edges ({edges_percent}%), \ + {selected_temporal_edges}/{total_temporal_edges} temporal edges ({temporal_edges_percent}%)" + ); + + println!( + "Starting filtered SF1 materialize using RecordBatches at {}", + Local::now() + ); + let recordbatch_start = Instant::now(); + let recordbatch_graph = materialize_impl( + &filtered, + Some(&rb_materialize_graph_path), + g.core_graph().extension().config().clone(), + ) + .unwrap(); + let recordbatch_elapsed = recordbatch_start.elapsed(); + println!( + "Finished filtered SF1 materialize using RecordBatches at {}\nTook 
{recordbatch_elapsed:?}", + Local::now() + ); + + println!( + "Starting filtered SF1 materialize impl (old) at {}", + Local::now() + ); + let impl_start = Instant::now(); + let materialize_impl_graph = filtered + .materialize_at(&old_materialize_graph_path) + .unwrap(); + let impl_elapsed = impl_start.elapsed(); + println!( + "Finished filtered SF1 materialize impl (old) at {}\nTook {impl_elapsed:?}", + Local::now() + ); + + println!("Checking RecordBatch materialized graph"); + assert_graph_equal_timestamps(&filtered, &recordbatch_graph); + println!("Passed!\nChecking old materialized graph"); + assert_graph_equal_timestamps(&filtered, &materialize_impl_graph); + println!("Passed!"); + + println!( + "Filtered SF1 parity check passed.\n materialize_using_recordbatches: {:?}\n materialize_impl: {:?}", + recordbatch_elapsed, impl_elapsed + ); + remove_dir_all_ignore_not_found(&old_materialize_graph_path).unwrap(); + remove_dir_all_ignore_not_found(&rb_materialize_graph_path).unwrap(); +} + +#[cfg(feature = "io")] +fn get_new_materialize_time(graph_path: &Path, materialize_graph_path: &Path) -> Duration { + remove_dir_all_ignore_not_found(&materialize_graph_path).unwrap(); + fs::create_dir_all(&materialize_graph_path).unwrap(); + + if !graph_path.exists() { + panic!("SNB graph not found at {}", graph_path.display()); + } + + println!("Loading SF10 SNB graph from {}", graph_path.display()); + let sf10_graph = Graph::load(&graph_path).unwrap(); + println!( + "Loaded SF10 source graph: {} nodes, {} edges, {} temporal edges", + sf10_graph.count_nodes(), + sf10_graph.count_edges(), + sf10_graph.count_temporal_edges() + ); + + println!( + "Starting SF10 materialize using RecordBatches at {}", + Local::now() + ); + let recordbatch_start = Instant::now(); + let _recordbatch_graph = materialize_impl( + &sf10_graph, + Some(&materialize_graph_path), + sf10_graph.core_graph().extension().config().clone(), + ) + .unwrap(); + let recordbatch_elapsed = recordbatch_start.elapsed(); + 
println!( + "Finished SF10 materialize using RecordBatches at {}\nTook {recordbatch_elapsed:?}", + Local::now() + ); + drop(_recordbatch_graph); + drop(sf10_graph); + // free up disk space for next test + remove_dir_all_ignore_not_found(&materialize_graph_path).unwrap(); + recordbatch_elapsed +} + +#[cfg(feature = "io")] +fn get_parquet_decode_time( + graph_path: &Path, + parquet_path: &Path, + decode_graph_path: &Path, +) -> Duration { + remove_dir_all_ignore_not_found(&decode_graph_path).unwrap(); + fs::create_dir_all(&decode_graph_path).unwrap(); + + if !graph_path.exists() { + panic!("SNB graph not found at {}", graph_path.display()); + } + if !parquet_path.exists() { + panic!( + "SNB parquet directory not found at {}", + parquet_path.display() + ); + } + + println!("Loading SF10 SNB graph from {}", graph_path.display()); + let sf10_graph = Graph::load(&graph_path).unwrap(); + println!( + "Loaded SF10 source graph in: {} nodes, {} edges, {} temporal edges", + sf10_graph.count_nodes(), + sf10_graph.count_edges(), + sf10_graph.count_temporal_edges() + ); + let sf10_extension_config = sf10_graph.core_graph().extension().config().clone(); + drop(sf10_graph); + + println!("Starting SF10 decode_parquet at {}", Local::now()); + let parquet_decode_start = Instant::now(); + let _parquet_graph = MaterializedGraph::decode_parquet( + &parquet_path, + Some(&decode_graph_path), + sf10_extension_config, + ) + .unwrap(); + let parquet_decode_elapsed = parquet_decode_start.elapsed(); + println!( + "Finished SF10 decode_parquet at {}\nTook {parquet_decode_elapsed:?}", + Local::now() + ); + drop(_parquet_graph); + // free up disk space for next test + remove_dir_all_ignore_not_found(&decode_graph_path).unwrap(); + parquet_decode_elapsed +} + +// FIXME: Is there a way to safely import these from parquet/mod.rs? 
+const RAP_NODE_ID_COL: &str = "rap_node_id"; +const RAP_NODE_VID_COL: &str = "rap_node_vid"; +const RAP_NODE_TYPE_COL: &str = "rap_node_type"; +const RAP_NODE_TYPE_ID_COL: &str = "rap_node_type_id"; +const RAP_TIME_COL: &str = "rap_time"; +const RAP_SECONDARY_INDEX_COL: &str = "rap_secondary_index"; +const RAP_SRC_ID_COL: &str = "rap_src_id"; +const RAP_DST_ID_COL: &str = "rap_dst_id"; +const RAP_SRC_VID_COL: &str = "rap_src_vid"; +const RAP_DST_VID_COL: &str = "rap_dst_vid"; +const RAP_EDGE_ID_COL: &str = "rap_edge_id"; +const RAP_LAYER_COL: &str = "rap_layer"; +const RAP_LAYER_ID_COL: &str = "rap_layer_id"; +const GRAPH_C_PARQUET_DIR: &str = "graph_c"; +const GRAPH_T_PARQUET_DIR: &str = "graph_t"; +const NODES_C_PARQUET_DIR: &str = "nodes_c"; +const NODES_T_PARQUET_DIR: &str = "nodes_t"; +const EDGES_T_PARQUET_DIR: &str = "edges_t"; +const EDGES_D_PARQUET_DIR: &str = "edges_d"; +const EDGES_C_PARQUET_DIR: &str = "edges_c"; + +#[cfg(feature = "io")] +fn parquet_prop_columns(path: &Path, exclude: &[&str]) -> Vec { + get_parquet_file_paths(path) + .unwrap() + .into_iter() + .next() + .map(|file| { + ArrowReaderMetadata::load(&fs::File::open(file).unwrap(), Default::default()) + .unwrap() + .schema() + .fields() + .iter() + .map(|field| field.name().to_string()) + .filter(|name| !exclude.iter().any(|excluded| excluded == name)) + .collect() + }) + .unwrap_or_default() +} + +#[cfg(feature = "io")] +fn get_parquet_encode_time(graph_path: &Path, parquet_graph_path: &Path) -> Duration { + remove_dir_all_ignore_not_found(&parquet_graph_path).unwrap(); + fs::create_dir_all(&parquet_graph_path).unwrap(); + + if !graph_path.exists() { + panic!("SNB graph not found at {}", graph_path.display()); + } + + println!("Loading SF10 SNB graph from {}", graph_path.display()); + let sf10_graph = Graph::load(&graph_path).unwrap(); + println!( + "Loaded SF10 source graph: {} nodes, {} edges, {} temporal edges", + sf10_graph.count_nodes(), + sf10_graph.count_edges(), + 
sf10_graph.count_temporal_edges() + ); + + println!("Starting SF10 encode_parquet at {}", Local::now()); + let parquet_dump_start = Instant::now(); + sf10_graph.encode_parquet(parquet_graph_path).unwrap(); + let parquet_dump_elapsed = parquet_dump_start.elapsed(); + println!( + "Finished SF10 encode_parquet at {}\nTook {parquet_dump_elapsed:?}", + Local::now() + ); + + parquet_dump_elapsed +} + +#[cfg(feature = "io")] +fn get_parquet_df_loader_time( + graph_path: &Path, + parquet_path: &Path, + load_graph_path: &Path, +) -> Duration { + remove_dir_all_ignore_not_found(&load_graph_path).unwrap(); + fs::create_dir_all(&load_graph_path).unwrap(); + + if !graph_path.exists() { + panic!("SNB graph not found at {}", graph_path.display()); + } + if !parquet_path.exists() { + panic!( + "SNB parquet graph directory not found at {}", + parquet_path.display() + ); + } + + println!("Loading SF10 SNB graph from {}", graph_path.display()); + let sf10_graph = Graph::load(&graph_path).unwrap(); + println!( + "Loaded SF10 source graph: {} nodes, {} edges, {} temporal edges", + sf10_graph.count_nodes(), + sf10_graph.count_edges(), + sf10_graph.count_temporal_edges() + ); + let sf10_extension_config = sf10_graph.core_graph().extension().config().clone(); + drop(sf10_graph); + + let replay_graph = + Graph::new_at_path_with_config(load_graph_path, sf10_extension_config).unwrap(); + println!("Starting SF10 parquet loader replay at {}", Local::now()); + let parquet_load_start = Instant::now(); + + let c_graph_path = parquet_path.join(GRAPH_C_PARQUET_DIR); + if c_graph_path.exists() { + let graph_c_metadata = parquet_prop_columns(&c_graph_path, &[RAP_TIME_COL]); + let graph_c_metadata = graph_c_metadata + .iter() + .map(String::as_str) + .collect::>(); + let graph_c_start = Instant::now(); + load_graph_props_from_parquet( + &replay_graph, + &c_graph_path, + RAP_TIME_COL, + None, + &[], + &graph_c_metadata, + None, + None, + ) + .unwrap(); + println!( + "GraphC loaded at {}\nTook {:?}", + 
Local::now(), + graph_c_start.elapsed() + ); + } + + let t_graph_path = parquet_path.join(GRAPH_T_PARQUET_DIR); + if t_graph_path.exists() { + let graph_t_props = + parquet_prop_columns(&t_graph_path, &[RAP_TIME_COL, RAP_SECONDARY_INDEX_COL]); + let graph_t_props = graph_t_props.iter().map(String::as_str).collect::>(); + let graph_t_start = Instant::now(); + load_graph_props_from_parquet( + &replay_graph, + &t_graph_path, + RAP_TIME_COL, + Some(RAP_SECONDARY_INDEX_COL), + &graph_t_props, + &[], + None, + None, + ) + .unwrap(); + println!( + "GraphT loaded at {}\nTook {:?}", + Local::now(), + graph_t_start.elapsed() + ); + } + + let c_node_path = parquet_path.join(NODES_C_PARQUET_DIR); + if c_node_path.exists() { + let node_c_metadata = parquet_prop_columns( + &c_node_path, + &[ + RAP_NODE_ID_COL, + RAP_NODE_VID_COL, + RAP_NODE_TYPE_COL, + RAP_NODE_TYPE_ID_COL, + ], + ); + let node_c_metadata = node_c_metadata + .iter() + .map(String::as_str) + .collect::>(); + let nodes_c_start = Instant::now(); + load_node_metadata_from_parquet( + &replay_graph, + &c_node_path, + RAP_NODE_ID_COL, + None, + Some(RAP_NODE_TYPE_COL), + Some(RAP_NODE_VID_COL), + Some(RAP_NODE_TYPE_ID_COL), + &node_c_metadata, + None, + None, + None, + ) + .unwrap(); + println!( + "NodesC loaded at {}\nTook {:?}", + Local::now(), + nodes_c_start.elapsed() + ); + } + + let t_node_path = parquet_path.join(NODES_T_PARQUET_DIR); + if t_node_path.exists() { + let node_t_props = parquet_prop_columns( + &t_node_path, + &[ + RAP_NODE_ID_COL, + RAP_NODE_VID_COL, + RAP_NODE_TYPE_COL, + RAP_TIME_COL, + RAP_SECONDARY_INDEX_COL, + ], + ); + let node_t_props = node_t_props.iter().map(String::as_str).collect::>(); + let nodes_t_start = Instant::now(); + load_nodes_from_parquet( + &replay_graph, + &t_node_path, + RAP_TIME_COL, + Some(RAP_SECONDARY_INDEX_COL), + RAP_NODE_VID_COL, + None, + None, + &node_t_props, + &[], + None, + None, + None, + None, + false, + None, + ) + .unwrap(); + println!( + "NodesT loaded at 
{}\nTook {:?}", + Local::now(), + nodes_t_start.elapsed() + ); + } + + let t_edge_path = parquet_path.join(EDGES_T_PARQUET_DIR); + if t_edge_path.exists() { + let edge_t_props = parquet_prop_columns( + &t_edge_path, + &[ + RAP_TIME_COL, + RAP_SECONDARY_INDEX_COL, + RAP_SRC_VID_COL, + RAP_SRC_ID_COL, + RAP_DST_VID_COL, + RAP_DST_ID_COL, + RAP_LAYER_COL, + RAP_LAYER_ID_COL, + RAP_EDGE_ID_COL, + ], + ); + let edge_t_props = edge_t_props.iter().map(String::as_str).collect::>(); + let edges_t_start = Instant::now(); + load_edges_from_parquet( + &replay_graph, + &t_edge_path, + ColumnNames::new( + RAP_TIME_COL, + Some(RAP_SECONDARY_INDEX_COL), + RAP_SRC_VID_COL, + RAP_DST_VID_COL, + Some(RAP_LAYER_COL), + ) + .with_layer_id_col(RAP_LAYER_ID_COL) + .with_edge_id_col(RAP_EDGE_ID_COL), + false, + &edge_t_props, + &[], + None, + None, + None, + None, + ) + .unwrap(); + println!( + "EdgesT loaded at {}\nTook {:?}", + Local::now(), + edges_t_start.elapsed() + ); + } + + let d_edge_path = parquet_path.join(EDGES_D_PARQUET_DIR); + if d_edge_path.exists() { + let edges_d_start = Instant::now(); + load_edge_deletions_from_parquet( + &replay_graph, + &d_edge_path, + ColumnNames::new( + RAP_TIME_COL, + Some(RAP_SECONDARY_INDEX_COL), + RAP_SRC_VID_COL, + RAP_DST_VID_COL, + Some(RAP_LAYER_COL), + ) + .with_layer_id_col(RAP_LAYER_ID_COL) + .with_edge_id_col(RAP_EDGE_ID_COL), + None, + false, + None, + None, + ) + .unwrap(); + println!( + "EdgesD loaded at {}\nTook {:?}", + Local::now(), + edges_d_start.elapsed() + ); + } + + let c_edge_path = parquet_path.join(EDGES_C_PARQUET_DIR); + if c_edge_path.exists() { + let edge_c_metadata = parquet_prop_columns( + &c_edge_path, + &[ + RAP_SRC_VID_COL, + RAP_SRC_ID_COL, + RAP_DST_VID_COL, + RAP_DST_ID_COL, + RAP_LAYER_COL, + RAP_EDGE_ID_COL, + ], + ); + let edge_c_metadata = edge_c_metadata + .iter() + .map(String::as_str) + .collect::>(); + let edges_c_start = Instant::now(); + load_edge_metadata_from_parquet( + &replay_graph, + &c_edge_path, 
+ RAP_SRC_VID_COL, + RAP_DST_VID_COL, + &edge_c_metadata, + None, + None, + Some(RAP_LAYER_COL), + None, + None, + false, + ) + .unwrap(); + println!( + "EdgesC loaded at {}\nTook {:?}", + Local::now(), + edges_c_start.elapsed() + ); + } + + let parquet_load_elapsed = parquet_load_start.elapsed(); + println!( + "Finished SF10 parquet loader replay at {}\nLoaded graph: {} nodes, {} edges, {} temporal edges\nTook {parquet_load_elapsed:?}", + Local::now(), + replay_graph.count_nodes(), + replay_graph.count_edges(), + replay_graph.count_temporal_edges(), + ); + drop(replay_graph); + remove_dir_all_ignore_not_found(&load_graph_path).unwrap(); + + parquet_load_elapsed +} + +#[cfg(feature = "io")] +#[test] +#[ignore = "requires locally persisted SNB SF10 graphs and parquet export"] +fn test_current() { + let graph_path = default_sf10_graph_path(); + let parquet_path = default_sf10_parquet_path(); + let parquet_loader_graph_path = default_materialized_graphs_path().join("parquet_loader_sf10"); + let parquet_decode_graph_path = default_materialized_graphs_path().join("parquet_decode_sf10"); + let materialize_graph_path = default_materialized_graphs_path().join("rb_materialize_sf10"); + + let materialize_duration = get_new_materialize_time(&graph_path, &materialize_graph_path); + + let parquet_dump_duration = get_parquet_encode_time(&graph_path, &parquet_path); + + let parquet_loader_duration = + get_parquet_df_loader_time(&graph_path, &parquet_path, &parquet_loader_graph_path); + + let parquet_decode_duration = + get_parquet_decode_time(&graph_path, &parquet_path, &parquet_decode_graph_path); + + println!( + "Summary:\n encode_parquet: {:?}\n parquet loaders replay: {:?}\n decode_parquet: {:?}\n materialize_using_recordbatches: {:?}", + parquet_dump_duration, + parquet_loader_duration, + parquet_decode_duration, + materialize_duration + ); +}