Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ arrow-json = "57.1.0"
arrow-schema = { version = "57.1.0", features = ["serde"] }
arrow-select = "57.1.0"
datafusion = "51.0.0"
datafusion-proto = "51.0.0"
object_store = { version = "0.12.4", features = [
"cloud",
"aws",
Expand Down
6 changes: 6 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,12 @@ pub mod validator;
use std::time::Duration;

// Public re-exports of crates being used in enterprise
pub use arrow_array;
pub use arrow_flight;
pub use arrow_ipc;
pub use catalog as parseable_catalog;
pub use datafusion;
pub use datafusion_proto;
pub use handlers::http::modal::{
ParseableServer, ingest_server::IngestServer, query_server::QueryServer, server::Server,
};
Expand All @@ -65,6 +70,7 @@ pub use openid;
pub use opentelemetry_proto;
use parseable::PARSEABLE;
use reqwest::{Client, ClientBuilder};
pub use utils as parseable_utils;

// It is very unlikely that panic will occur when dealing with locks.
pub const LOCK_EXPECT: &str = "Thread shouldn't panic while holding a lock";
Expand Down
86 changes: 54 additions & 32 deletions src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ use arrow_schema::SchemaRef;
use chrono::NaiveDateTime;
use chrono::{DateTime, Duration, Utc};
use datafusion::arrow::record_batch::RecordBatch;
use datafusion::catalog::SchemaProvider;
use datafusion::common::tree_node::Transformed;
use datafusion::execution::disk_manager::DiskManager;
use datafusion::execution::{
Expand All @@ -45,7 +46,7 @@ use datafusion::sql::sqlparser::dialect::PostgreSqlDialect;
use futures::Stream;
use futures::stream::select_all;
use itertools::Itertools;
use once_cell::sync::Lazy;
use once_cell::sync::{Lazy, OnceCell};
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};
use std::ops::Bound;
Expand All @@ -57,7 +58,6 @@ use sysinfo::System;
use tokio::runtime::Runtime;

use self::error::ExecuteError;
use self::stream_schema_provider::GlobalSchemaProvider;
pub use self::stream_schema_provider::PartialTimeFilter;
use crate::alerts::alert_structs::Conditions;
use crate::alerts::alerts_utils::get_filter_string;
Expand All @@ -70,11 +70,11 @@ use crate::handlers::http::query::QueryError;
use crate::metrics::increment_bytes_scanned_in_query_by_date;
use crate::option::Mode;
use crate::parseable::{DEFAULT_TENANT, PARSEABLE};
use crate::storage::{ObjectStorageProvider, ObjectStoreFormat};
use crate::query::stream_schema_provider::GlobalSchemaProvider;
use crate::storage::{ObjectStorage, ObjectStorageProvider, ObjectStoreFormat};
use crate::utils::time::TimeRange;

// pub static QUERY_SESSION: Lazy<SessionContext> =
// Lazy::new(|| Query::create_session_context(PARSEABLE.storage()));
pub static SCHEMA_PROVIDER: OnceCell<Box<dyn ParseableSchemaProvider>> = OnceCell::new();

pub static QUERY_SESSION_STATE: Lazy<SessionState> =
Lazy::new(|| Query::create_session_state(PARSEABLE.storage()));
Expand All @@ -90,6 +90,15 @@ pub static QUERY_SESSION: Lazy<InMemorySessionContext> = Lazy::new(|| {
}
});

/// Trait to enable implementation of SchemaProvider
pub trait ParseableSchemaProvider: Send + Sync {
fn new_provider(
&self,
storage: Option<Arc<dyn ObjectStorage>>,
tenant_id: &Option<String>,
) -> Box<dyn SchemaProvider>;
}

pub struct InMemorySessionContext {
session_context: Arc<RwLock<SessionContext>>,
}
Expand All @@ -104,18 +113,23 @@ impl InMemorySessionContext {
}

pub fn add_schema(&self, tenant_id: &str) {
let schema_provider = if let Some(provider) = SCHEMA_PROVIDER.get() {
provider.new_provider(
Some(PARSEABLE.storage().get_object_store()),
&Some(tenant_id.to_owned()),
)
} else {
Box::new(GlobalSchemaProvider {
storage: PARSEABLE.storage().get_object_store(),
tenant_id: Some(tenant_id.to_owned()),
})
};
self.session_context
.write()
.expect("SessionContext should be writeable")
.catalog("datafusion")
.expect("Default catalog should be available")
.register_schema(
tenant_id,
Arc::new(GlobalSchemaProvider {
storage: PARSEABLE.storage().get_object_store(),
tenant_id: Some(tenant_id.to_owned()),
}),
)
.register_schema(tenant_id, schema_provider.into())
.expect("Should be able to register new schema");
}

Expand Down Expand Up @@ -194,29 +208,41 @@ impl Query {
// register multiple schemas
if let Some(tenants) = PARSEABLE.list_tenants() {
for t in tenants.iter() {
let schema_provider = Arc::new(GlobalSchemaProvider {
storage: storage.get_object_store(),
tenant_id: Some(t.clone()),
});
let _ = catalog.register_schema(t, schema_provider);
let schema_provider = if let Some(provider) = SCHEMA_PROVIDER.get() {
provider.new_provider(
Some(PARSEABLE.storage().get_object_store()),
&Some(t.to_owned()),
)
} else {
Box::new(GlobalSchemaProvider {
storage: PARSEABLE.storage().get_object_store(),
tenant_id: Some(t.to_owned()),
})
};
let _ = catalog.register_schema(t, schema_provider.into());
}
}
} else {
// register just one schema
let schema_provider = Arc::new(GlobalSchemaProvider {
storage: storage.get_object_store(),
tenant_id: None,
});
let schema_provider = if let Some(provider) = SCHEMA_PROVIDER.get() {
provider.new_provider(Some(PARSEABLE.storage().get_object_store()), &None)
} else {
Box::new(GlobalSchemaProvider {
storage: PARSEABLE.storage().get_object_store(),
tenant_id: None,
})
};

let _ = catalog.register_schema(
&state.config_options().catalog.default_schema,
schema_provider,
schema_provider.into(),
);
Comment on lines +211 to 239
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Pass the caller's storage into the schema provider.

create_session_context(storage) accepts an ObjectStorageProvider, but both branches still hand PARSEABLE.storage().get_object_store() to the provider. Any caller creating a context over a different storage backend will register schemas against the wrong object store.

Suggested fix
 pub fn create_session_context(storage: Arc<dyn ObjectStorageProvider>) -> SessionContext {
     let state = Self::create_session_state(storage.clone());
+    let object_store = storage.get_object_store();

     let catalog = state
         .catalog_list()
         .catalog(&state.config_options().catalog.default_catalog)
         .expect("default catalog is provided by datafusion");

     if PARSEABLE.options.is_multi_tenant() {
         if let Some(tenants) = PARSEABLE.list_tenants() {
             for t in tenants.iter() {
                 let schema_provider = if let Some(provider) = SCHEMA_PROVIDER.get() {
                     provider.new_provider(
-                        Some(PARSEABLE.storage().get_object_store()),
+                        Some(object_store.clone()),
                         &Some(t.to_owned()),
                     )
                 } else {
                     Box::new(GlobalSchemaProvider {
-                        storage: PARSEABLE.storage().get_object_store(),
+                        storage: object_store.clone(),
                         tenant_id: Some(t.to_owned()),
                     })
                 };
                 let _ = catalog.register_schema(t, schema_provider.into());
             }
         }
     } else {
         let schema_provider = if let Some(provider) = SCHEMA_PROVIDER.get() {
-            provider.new_provider(Some(PARSEABLE.storage().get_object_store()), &None)
+            provider.new_provider(Some(object_store.clone()), &None)
         } else {
             Box::new(GlobalSchemaProvider {
-                storage: PARSEABLE.storage().get_object_store(),
+                storage: object_store,
                 tenant_id: None,
             })
         };

         let _ = catalog.register_schema(
             &state.config_options().catalog.default_schema,
             schema_provider.into(),
         );
     }

     SessionContext::new_with_state(state)
 }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/query/mod.rs` around lines 211-239, the schema-provider registration
uses `PARSEABLE.storage().get_object_store()` instead of the `storage` argument
passed into `create_session_context(storage)`, so schemas are registered
against the wrong backend. In both branches where `SCHEMA_PROVIDER.new_provider`
and `GlobalSchemaProvider` are constructed, use the caller's storage (the
function parameter named `storage`) by passing `Some(storage.get_object_store())`
or `storage.get_object_store()`, keeping the tenant_id arguments as before;
then call `catalog.register_schema` as-is so the catalog is registered against
the provided storage rather than `PARSEABLE.storage()`.

}

SessionContext::new_with_state(state)
}

fn create_session_state(storage: Arc<dyn ObjectStorageProvider>) -> SessionState {
pub fn create_session_state(storage: Arc<dyn ObjectStorageProvider>) -> SessionState {
let runtime_config = storage
.get_datafusion_runtime()
.with_disk_manager_builder(DiskManager::builder());
Expand Down Expand Up @@ -303,8 +329,8 @@ impl Query {
),
ExecuteError,
> {
let df = QUERY_SESSION
.get_ctx()
let ctx = QUERY_SESSION.get_ctx();
let df = ctx
.execute_logical_plan(self.final_logical_plan(tenant_id))
.await?;
let tenant = tenant_id.as_deref().unwrap_or(DEFAULT_TENANT);
Expand All @@ -320,14 +346,10 @@ impl Query {
return Ok((Either::Left(vec![]), fields));
}

let plan = QUERY_SESSION
.get_ctx()
.state()
.create_physical_plan(df.logical_plan())
.await?;
let plan = ctx.state().create_physical_plan(df.logical_plan()).await?;

let results = if !is_streaming {
let task_ctx = QUERY_SESSION.get_ctx().task_ctx();
let task_ctx = ctx.task_ctx();

let batches = collect_partitioned(plan.clone(), task_ctx.clone())
.await?
Expand All @@ -343,7 +365,7 @@ impl Query {

Either::Left(batches)
} else {
let task_ctx = QUERY_SESSION.get_ctx().task_ctx();
let task_ctx = ctx.task_ctx();

let output_partitions = plan.output_partitioning().partition_count();

Expand Down
2 changes: 1 addition & 1 deletion src/query/stream_schema_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ pub enum PartialTimeFilter {
}

impl PartialTimeFilter {
fn try_from_expr(expr: &Expr, time_partition: &Option<String>) -> Option<Self> {
pub fn try_from_expr(expr: &Expr, time_partition: &Option<String>) -> Option<Self> {
let Expr::BinaryExpr(binexpr) = expr else {
return None;
};
Expand Down
Loading