diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/configuration/Configuration.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/configuration/Configuration.java index eec8ce16d..ba4103cd7 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/configuration/Configuration.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/configuration/Configuration.java @@ -5,25 +5,68 @@ import java.io.Serializable; import java.util.Arrays; +/** + * Serializable, typed view over a Flink {@link ParameterTool} that exposes a Dagger job's + * configuration. + * + *
<p>
It wraps the parameters supplied at job submission and provides typed accessors (string, + * string array, integer, boolean, long) with optional defaults, so the rest of the codebase can + * read configuration keys without dealing with parsing. Being {@link Serializable}, the + * configuration can be captured by operators and shipped with the Flink job graph. + */ public class Configuration implements Serializable { + /** The underlying Flink parameter source backing all lookups. */ private final ParameterTool param; + /** + * Wraps the given Flink parameters. + * + * @param param the parameter source holding the job's configuration key/value pairs + */ public Configuration(ParameterTool param) { this.param = param; } + /** + * Returns the underlying Flink parameter source. + * + * @return the wrapped {@link ParameterTool} + */ public ParameterTool getParam() { return param; } + /** + * Returns the value of a configuration key as a string. + * + * @param configKey the configuration key to look up + * @return the configured value, or {@code null} if the key is absent + */ public String getString(String configKey) { return param.get(configKey); } + /** + * Returns the value of a configuration key as a string, or a default when absent. + * + * @param configKey the configuration key to look up + * @param defaultValue the value to return when the key is not present + * @return the configured value, or {@code defaultValue} if the key is absent + */ public String getString(String configKey, String defaultValue) { return param.get(configKey, defaultValue); } + /** + * Returns the value of a configuration key split into a trimmed string array. + * + *
<p>
The raw value is split on commas with each element trimmed. When the key is missing or its + * value is blank, the supplied default array is returned instead. + * + * @param configKey the configuration key to look up + * @param defaultValue the array to return when the key is absent or blank + * @return the parsed comma-separated values, or {@code defaultValue} if none are present + */ public String[] getStringArray(String configKey, String[] defaultValue) { String value = param.get(configKey); if (value == null || value.trim().isEmpty()) { @@ -33,14 +76,35 @@ public String[] getStringArray(String configKey, String[] defaultValue) { return Arrays.stream(value.split(",")).map(String::trim).toArray(String[]::new); } + /** + * Returns the value of a configuration key as an integer, or a default when absent. + * + * @param configKey the configuration key to look up + * @param defaultValue the value to return when the key is not present + * @return the configured integer, or {@code defaultValue} if the key is absent + */ public Integer getInteger(String configKey, Integer defaultValue) { return param.getInt(configKey, defaultValue); } + /** + * Returns the value of a configuration key as a boolean, or a default when absent. + * + * @param configKey the configuration key to look up + * @param defaultValue the value to return when the key is not present + * @return the configured boolean, or {@code defaultValue} if the key is absent + */ public Boolean getBoolean(String configKey, Boolean defaultValue) { return param.getBoolean(configKey, defaultValue); } + /** + * Returns the value of a configuration key as a long, or a default when absent. 
+ * + * @param configKey the configuration key to look up + * @param defaultValue the value to return when the key is not present + * @return the configured long, or {@code defaultValue} if the key is absent + */ public Long getLong(String configKey, Long defaultValue) { return param.getLong(configKey, defaultValue); } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/Constants.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/Constants.java index d1ab88b6f..ab3125e9c 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/Constants.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/Constants.java @@ -1,33 +1,67 @@ package com.gotocompany.dagger.common.core; +/** + * Centralized constants shared across Dagger's common module. + * + *
<p>
Holds the configuration keys (and their default values) for the Stencil schema-registry + * client, a handful of telemetry/UDF identifiers, stream input-schema configuration keys, and the + * internal column names Dagger appends to every deserialized {@link org.apache.flink.types.Row}. + * This is a constant holder and is not intended to be instantiated. + */ public class Constants { + /** Configuration key toggling use of the remote Stencil schema registry. */ public static final String SCHEMA_REGISTRY_STENCIL_ENABLE_KEY = "SCHEMA_REGISTRY_STENCIL_ENABLE"; + /** Default for {@link #SCHEMA_REGISTRY_STENCIL_ENABLE_KEY}: remote Stencil disabled. */ public static final boolean SCHEMA_REGISTRY_STENCIL_ENABLE_DEFAULT = false; + /** Configuration key holding the comma-separated Stencil registry URLs. */ public static final String SCHEMA_REGISTRY_STENCIL_URLS_KEY = "SCHEMA_REGISTRY_STENCIL_URLS"; + /** Default for {@link #SCHEMA_REGISTRY_STENCIL_URLS_KEY}: no URLs configured. */ public static final String SCHEMA_REGISTRY_STENCIL_URLS_DEFAULT = ""; + /** Configuration key for the Stencil descriptor fetch timeout, in milliseconds. */ public static final String SCHEMA_REGISTRY_STENCIL_FETCH_TIMEOUT_MS = "SCHEMA_REGISTRY_STENCIL_FETCH_TIMEOUT_MS"; + /** Default Stencil fetch timeout: {@code 10000} ms (10 seconds). */ public static final Integer SCHEMA_REGISTRY_STENCIL_FETCH_TIMEOUT_MS_DEFAULT = 10000; + /** Configuration key for the comma-separated {@code name:value} HTTP headers sent to Stencil. */ public static final String SCHEMA_REGISTRY_STENCIL_FETCH_HEADERS_KEY = "SCHEMA_REGISTRY_STENCIL_FETCH_HEADERS"; + /** Default for {@link #SCHEMA_REGISTRY_STENCIL_FETCH_HEADERS_KEY}: no headers. */ public static final String SCHEMA_REGISTRY_STENCIL_FETCH_HEADERS_DEFAULT = ""; + /** Configuration key toggling automatic refresh of the Stencil descriptor cache. 
*/ public static final String SCHEMA_REGISTRY_STENCIL_CACHE_AUTO_REFRESH_KEY = "SCHEMA_REGISTRY_STENCIL_CACHE_AUTO_REFRESH"; + /** Default for {@link #SCHEMA_REGISTRY_STENCIL_CACHE_AUTO_REFRESH_KEY}: auto-refresh disabled. */ public static final boolean SCHEMA_REGISTRY_STENCIL_CACHE_AUTO_REFRESH_DEFAULT = false; + /** Configuration key for the Stencil descriptor cache time-to-live, in milliseconds. */ public static final String SCHEMA_REGISTRY_STENCIL_CACHE_TTL_MS_KEY = "SCHEMA_REGISTRY_STENCIL_CACHE_TTL_MS"; + /** Default Stencil cache TTL: {@code 900000} ms (15 minutes). */ public static final Long SCHEMA_REGISTRY_STENCIL_CACHE_TTL_MS_DEFAULT = 900000L; + /** Configuration key selecting the Stencil schema refresh strategy. */ public static final String SCHEMA_REGISTRY_STENCIL_REFRESH_STRATEGY_KEY = "SCHEMA_REGISTRY_STENCIL_REFRESH_STRATEGY"; + /** Default Stencil refresh strategy: {@code "LONG_POLLING"}. */ public static final String SCHEMA_REGISTRY_STENCIL_REFRESH_STRATEGY_DEFAULT = "LONG_POLLING"; + /** Configuration key for the minimum back-off between Stencil fetch retries, in milliseconds. */ public static final String SCHEMA_REGISTRY_STENCIL_FETCH_BACKOFF_MIN_MS_KEY = "SCHEMA_REGISTRY_STENCIL_FETCH_BACKOFF_MIN_MS"; + /** Default minimum Stencil fetch back-off: {@code 60000} ms (1 minute). */ public static final Long SCHEMA_REGISTRY_STENCIL_FETCH_BACKOFF_MIN_MS_DEFAULT = 60000L; + /** Configuration key for the number of times a failed Stencil fetch is retried. */ public static final String SCHEMA_REGISTRY_STENCIL_FETCH_RETRIES_KEY = "SCHEMA_REGISTRY_STENCIL_FETCH_RETRIES"; + /** Default number of Stencil fetch retries: {@code 4}. */ public static final Integer SCHEMA_REGISTRY_STENCIL_FETCH_RETRIES_DEFAULT = 4; + /** Metric group key under which user-defined function (UDF) telemetry is reported. */ public static final String UDF_TELEMETRY_GROUP_KEY = "udf"; + /** Aspect/field name used when publishing gauge metric values. 
*/ public static final String GAUGE_ASPECT_NAME = "value"; + /** Length of the sliding time window used when aggregating/reporting metrics. */ public static final long SLIDING_TIME_WINDOW = 10; + /** Stream configuration key naming the protobuf class for an input stream's schema. */ public static final String STREAM_INPUT_SCHEMA_PROTO_CLASS = "INPUT_SCHEMA_PROTO_CLASS"; + /** Stream configuration key naming the table/alias for an input stream. */ public static final String STREAM_INPUT_SCHEMA_TABLE = "INPUT_SCHEMA_TABLE"; + /** Configuration key holding the definition of the job's input streams. */ public static final String INPUT_STREAMS = "STREAMS"; + /** Name of the internal boolean column Dagger appends to flag whether a record parsed successfully. */ public static final String INTERNAL_VALIDATION_FIELD_KEY = "__internal_validation_field__"; + /** Default name of the event-time (rowtime) attribute column appended to every row. */ public static final String ROWTIME = "rowtime"; } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/DaggerContext.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/DaggerContext.java index 79c8eaa2b..af4350aba 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/DaggerContext.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/DaggerContext.java @@ -13,10 +13,25 @@ * It initializes with StreamExecutionEnvironment, StreamTableEnvironment and Configuration. */ public class DaggerContext { + /** + * Logger used to record lifecycle events of the {@link DaggerContext} singleton. + */ private static final Logger LOGGER = LoggerFactory.getLogger(DaggerContext.class.getName()); + /** + * The lazily-created, {@code volatile} singleton instance shared across the Dagger job. + */ private static volatile DaggerContext daggerContext = null; + /** + * The Flink {@link StreamExecutionEnvironment} that backs the streaming job. 
+ */ private final StreamExecutionEnvironment executionEnvironment; + /** + * The Flink {@link StreamTableEnvironment} used to evaluate the Table/SQL pipeline. + */ private final StreamTableEnvironment tableEnvironment; + /** + * The user-supplied {@link Configuration} that parameterizes the Dagger job. + */ private final Configuration configuration; /** @@ -55,14 +70,29 @@ public static synchronized DaggerContext init(Configuration configuration) { return daggerContext; } + /** + * Returns the Flink {@link StreamExecutionEnvironment} held by this context. + * + * @return the stream execution environment + */ public StreamExecutionEnvironment getExecutionEnvironment() { return executionEnvironment; } + /** + * Returns the Flink {@link StreamTableEnvironment} held by this context. + * + * @return the stream table environment + */ public StreamTableEnvironment getTableEnvironment() { return tableEnvironment; } + /** + * Returns the {@link Configuration} that was used to initialize this context. + * + * @return the configuration + */ public Configuration getConfiguration() { return configuration; } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/FieldDescriptorCache.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/FieldDescriptorCache.java index 8b265dc87..c4b255617 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/FieldDescriptorCache.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/FieldDescriptorCache.java @@ -8,15 +8,42 @@ import java.util.Map; +/** + * Serializable cache of protobuf field positions, used to keep field indices stable when the + * Stencil schema is refreshed at runtime. + * + *
<p>
On construction it walks a protobuf {@link Descriptors.Descriptor} recursively (descending into + * nested {@code MESSAGE} fields) and records, for every field, its original declared index keyed by + * fully-qualified name, plus the field count (arity) of every message type keyed by fully-qualified + * name. The protobuf and Parquet deserializers consult this cache so that rows are built against the + * field layout captured at startup rather than against a possibly-reordered refreshed descriptor. + */ public class FieldDescriptorCache implements Serializable { + /** Maps each field's fully-qualified name to its original declared index within its message. */ private final Map fieldDescriptorIndexMap = new HashMap<>(); + /** Maps each message type's fully-qualified name to its original field count (arity). */ private final Map protoDescriptorArityMap = new HashMap<>(); + /** + * Builds a cache by recursively indexing the given descriptor and all nested message types. + * + * @param descriptor the root protobuf descriptor to index + */ public FieldDescriptorCache(Descriptors.Descriptor descriptor) { cacheFieldDescriptorMap(descriptor); } + /** + * Recursively records field indices and message arities for the given descriptor. + * + *
<p>
Returns early if this message type has already been cached, which both avoids repeated work + * and guards against recursive or self-referential schemas. For every field it stores the + * field's original index, and for nested {@code MESSAGE} fields it recurses into the referenced + * message type. + * + * @param descriptor the protobuf descriptor whose fields should be cached + */ public void cacheFieldDescriptorMap(Descriptors.Descriptor descriptor) { if (protoDescriptorArityMap.containsKey(descriptor.getFullName())) { @@ -37,6 +64,13 @@ public void cacheFieldDescriptorMap(Descriptors.Descriptor descriptor) { } } + /** + * Returns the cached original index of a protobuf field. + * + * @param fieldDescriptor the field whose original index is requested + * @return the field's original declared index captured when it was cached + * @throws IllegalArgumentException if the field is not present in the cache + */ public int getOriginalFieldIndex(Descriptors.FieldDescriptor fieldDescriptor) { if (!fieldDescriptorIndexMap.containsKey(fieldDescriptor.getFullName())) { throw new IllegalArgumentException("The Field Descriptor " + fieldDescriptor.getFullName() + " was not found in the cache"); @@ -44,11 +78,24 @@ public int getOriginalFieldIndex(Descriptors.FieldDescriptor fieldDescriptor) { return fieldDescriptorIndexMap.get(fieldDescriptor.getFullName()); } + /** + * Indicates whether a field (by fully-qualified name) is present in the cache. + * + * @param fieldName the fully-qualified field name to look up + * @return {@code true} if the field has been cached, {@code false} otherwise + */ public boolean containsField(String fieldName) { return fieldDescriptorIndexMap.containsKey(fieldName); } + /** + * Returns the cached original field count (arity) of a protobuf message type. 
+ * + * @param descriptor the message descriptor whose original field count is requested + * @return the number of fields the message declared when it was cached + * @throws IllegalArgumentException if the descriptor is not present in the cache + */ public int getOriginalFieldCount(Descriptors.Descriptor descriptor) { if (!protoDescriptorArityMap.containsKey(descriptor.getFullName())) { throw new IllegalArgumentException("The Proto Descriptor " + descriptor.getFullName() + " was not found in the cache"); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/StencilClientOrchestrator.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/StencilClientOrchestrator.java index f182e66f1..cfcbe84eb 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/StencilClientOrchestrator.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/StencilClientOrchestrator.java @@ -23,9 +23,21 @@ * The Stencil client orchestrator for dagger. */ public class StencilClientOrchestrator implements Serializable { + /** + * The process-wide {@link StencilClient} cached and shared across orchestrator instances. + */ private static StencilClient stencilClient; + /** + * Logger used to report invalid header configuration and other diagnostics. + */ private static final Logger LOGGER = LoggerFactory.getLogger(StencilClientOrchestrator.class); + /** + * The Dagger {@link Configuration} from which Stencil settings are read. + */ private Configuration configuration; + /** + * The de-duplicated set of Stencil (schema registry) URLs to fetch descriptors from. + */ private HashSet stencilUrls; /** @@ -38,6 +50,12 @@ public StencilClientOrchestrator(Configuration configuration) { this.stencilUrls = getStencilUrls(); } + /** + * Builds a {@link StencilConfig} from the Dagger {@link Configuration}, wiring fetch headers, + * timeouts, cache behaviour, refresh strategy and retry/backoff settings. 
+ * + * @return the assembled Stencil configuration + */ public StencilConfig createStencilConfig() { return StencilConfig.builder() .fetchHeaders(getHeaders(configuration)) @@ -50,6 +68,16 @@ public StencilConfig createStencilConfig() { .build(); } + /** + * Resolves the {@link SchemaRefreshStrategy} to use for the Stencil cache. + * + *
<p>
Returns a version-based refresh strategy when {@code refreshStrategy} equals + * {@code "VERSION_BASED_REFRESH"} (case-insensitive); otherwise it falls back to the + * long-polling strategy, including when {@code refreshStrategy} is {@code null}. + * + * @param refreshStrategy the configured refresh-strategy name, may be {@code null} + * @return the matching schema refresh strategy + */ private SchemaRefreshStrategy getSchemaRefreshStrategy(String refreshStrategy) { if (refreshStrategy == null) { return SchemaRefreshStrategy.longPollingStrategy(); @@ -61,6 +89,12 @@ private SchemaRefreshStrategy getSchemaRefreshStrategy(String refreshStrategy) { } + /** + * Reads the configured fetch-header string and parses it into HTTP headers. + * + * @param config the configuration to read the header string from + * @return the parsed list of {@code Header} objects; empty when none are configured + */ private List
<Header>
getHeaders(Configuration config) { String headerString = config.getString(SCHEMA_REGISTRY_STENCIL_FETCH_HEADERS_KEY, SCHEMA_REGISTRY_STENCIL_FETCH_HEADERS_DEFAULT); return parseHeaders(headerString); @@ -97,6 +131,16 @@ public StencilClient enrichStencilClient(List additionalStencilUrls) { return stencilClient; } + /** + * Creates a {@link StencilClient} for the given URLs. + * + *
<p>
When remote Stencil is enabled in the configuration a registry-backed client is built + * from {@code urls} and the {@link StencilConfig}; otherwise a default in-classpath client + * is returned. + * + * @param urls the Stencil registry URLs to fetch descriptors from + * @return the initialized Stencil client + */ private StencilClient initStencilClient(List urls) { StencilConfig stencilConfig = createStencilConfig(); boolean enableRemoteStencil = configuration.getBoolean(SCHEMA_REGISTRY_STENCIL_ENABLE_KEY, SCHEMA_REGISTRY_STENCIL_ENABLE_DEFAULT); @@ -105,6 +149,13 @@ private StencilClient initStencilClient(List urls) { : StencilClientFactory.getClient(); } + /** + * Splits a comma-separated header string into individual, validated HTTP headers. + * + * @param headersString the raw {@code key:value} header pairs separated by commas, + * treated as empty when {@code null} + * @return the list of valid parsed headers; invalid entries are skipped + */ private List
<Header>
parseHeaders(String headersString) { headersString = headersString == null ? "" : headersString; return Arrays.stream(headersString.split(",")) @@ -114,6 +165,15 @@ private List<Header>
parseHeaders(String headersString) { .collect(Collectors.toList()); } + /** + * Checks whether a single header entry is well-formed, i.e. it contains exactly one + * non-empty key and one non-empty value separated by a colon. + * + *
<p>
A non-empty but malformed entry is logged and treated as invalid. + * + * @param headerString the trimmed {@code key:value} entry to validate + * @return {@code true} if the entry has a valid key and value, otherwise {@code false} + */ private Boolean isValidHeader(String headerString) { Boolean isValid = Arrays.stream(headerString.split(":")).map(String::trim).filter(a -> !a.isEmpty()).count() == 2; if (!isValid && !headerString.isEmpty()) { @@ -122,11 +182,22 @@ private Boolean isValidHeader(String headerString) { return isValid; } + /** + * Converts a single {@code key:value} entry into a {@link BasicHeader}. + * + * @param headerString the entry to parse; expected to contain a colon separator + * @return the header built from the trimmed key and value + */ private BasicHeader parseHeader(String headerString) { String[] split = headerString.split(":"); return new BasicHeader(split[0].trim(), split[1].trim()); } + /** + * Reads the configured Stencil URLs and collects them into a de-duplicated set. + * + * @return the set of trimmed, unique Stencil registry URLs + */ private HashSet getStencilUrls() { stencilUrls = Arrays.stream(configuration.getString(SCHEMA_REGISTRY_STENCIL_URLS_KEY, SCHEMA_REGISTRY_STENCIL_URLS_DEFAULT).split(",")) .map(String::trim) diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/StreamInfo.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/StreamInfo.java index 7b87b9fb4..f55d717f1 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/StreamInfo.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/StreamInfo.java @@ -7,7 +7,13 @@ * The class to hold the data stream and column names. */ public class StreamInfo { + /** + * The underlying Flink data stream of {@link Row} records. + */ private DataStream dataStream; + /** + * The column names describing the schema of each {@link Row} in the stream. 
+ */ private String[] columnNames; /** diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/Transformer.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/Transformer.java index 5f31d33a5..fe9db85c7 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/core/Transformer.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/core/Transformer.java @@ -4,5 +4,14 @@ * The interface for all the transformer. */ public interface Transformer { + /** + * Applies this transformation to the given {@link StreamInfo} and returns the result. + * + *
<p>
Implementations typically derive a new {@code DataStream} and/or column layout + * from the input and wrap them in the returned {@link StreamInfo}. + * + * @param streamInfo the input stream and its column metadata to transform + * @return the transformed stream information + */ StreamInfo transform(StreamInfo streamInfo); } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/DescriptorNotFoundException.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/DescriptorNotFoundException.java index f249d2639..e483180ec 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/DescriptorNotFoundException.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/DescriptorNotFoundException.java @@ -4,6 +4,7 @@ * The class Exception if Descriptor not found. */ public class DescriptorNotFoundException extends RuntimeException { + /** Default detail message used when no specific descriptor error message is supplied. */ public static final String DESCRIPTOR_NOT_FOUND = "descriptor not found"; /** diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/DaggerDeserializationException.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/DaggerDeserializationException.java index 4baddb397..3be5eccaf 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/DaggerDeserializationException.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/DaggerDeserializationException.java @@ -13,6 +13,11 @@ public DaggerDeserializationException(Exception innerException) { super(innerException); } + /** + * Instantiates a new Dagger deserialization exception with the specified detail message. 
+ * + * @param message the detail message describing the deserialization failure + */ public DaggerDeserializationException(String message) { super(message); } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/InvalidJSONSchemaException.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/InvalidJSONSchemaException.java index a3def3374..e7750fbb1 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/InvalidJSONSchemaException.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/InvalidJSONSchemaException.java @@ -1,6 +1,19 @@ package com.gotocompany.dagger.common.exceptions.serde; +/** + * Unchecked exception thrown when a configured JSON schema cannot be parsed or is structurally + * invalid. + * + *
<p>
Dagger can derive types and field mappings from a JSON schema when consuming or producing + * JSON-encoded data. If that schema is malformed or cannot be interpreted, this exception wraps the + * underlying parsing failure so the job fails fast instead of proceeding with an unusable schema. + */ public class InvalidJSONSchemaException extends RuntimeException { + /** + * Creates a new exception that wraps the underlying cause of the schema parsing failure. + * + * @param innerException the exception thrown while reading or parsing the JSON schema + */ public InvalidJSONSchemaException(Exception innerException) { super(innerException); } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/SimpleGroupParsingException.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/SimpleGroupParsingException.java index e2b162605..cf2084c48 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/SimpleGroupParsingException.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/exceptions/serde/SimpleGroupParsingException.java @@ -5,6 +5,11 @@ **/ public class SimpleGroupParsingException extends RuntimeException { + /** + * Instantiates a new Simple group parsing exception with the specified detail message. + * + * @param message the detail message describing why the field could not be parsed + */ public SimpleGroupParsingException(String message) { super(message); } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/aspects/AspectType.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/aspects/AspectType.java index 47b19499b..ccf09a197 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/aspects/AspectType.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/aspects/AspectType.java @@ -4,8 +4,12 @@ * The enum Aspect type. 
*/ public enum AspectType { + /** Aspect reported as a point-in-time gauge value. */ Gauge, + /** Aspect reported as a distribution of values via a histogram. */ Histogram, + /** Aspect reported as a metered rate of events. */ Metric, + /** Aspect reported as a monotonically increasing counter. */ Counter } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/aspects/Aspects.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/aspects/Aspects.java index 6d66ef558..38a3fea74 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/aspects/Aspects.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/aspects/Aspects.java @@ -4,7 +4,17 @@ * The interface for aspects. */ public interface Aspects { + /** + * Returns the metric name used when registering this aspect with a metric group. + * + * @return the metric name of the aspect + */ String getValue(); + /** + * Returns the kind of metric this aspect represents. + * + * @return the {@link AspectType} of the aspect + */ AspectType getAspectType(); } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/CounterStatsManager.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/CounterStatsManager.java index 46f9dede1..006bb48ac 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/CounterStatsManager.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/CounterStatsManager.java @@ -12,7 +12,9 @@ * The Counter stats manager. */ public class CounterStatsManager { + /** The Flink metric group under which counters are registered. */ private MetricGroup metricGroup; + /** Registered counters keyed by the aspect they measure. 
*/ private Map counters = new HashMap<>(); /** diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/GaugeStatsManager.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/GaugeStatsManager.java index ddcde1f91..2b3a08139 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/GaugeStatsManager.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/GaugeStatsManager.java @@ -9,7 +9,9 @@ */ public class GaugeStatsManager { + /** Whether gauge registration is enabled; when {@code false} all register calls are no-ops. */ private final Boolean enabled; + /** The Flink metric group under which gauges are registered. */ private final MetricGroup metricGroup; /** diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/MeterStatsManager.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/MeterStatsManager.java index 968267639..3986cb018 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/MeterStatsManager.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/metrics/managers/MeterStatsManager.java @@ -20,9 +20,13 @@ * The Meter stats manager. */ public class MeterStatsManager { + /** Histograms keyed by aspect, used to record value distributions over a sliding time window. */ private final HashMap histogramMap; + /** Whether metric registration and updates are enabled; when {@code false} calls are no-ops. */ private Boolean enabled; + /** Meters keyed by aspect, used to record event rates. */ private HashMap meterMap; + /** The Flink metric group under which histograms and meters are registered. */ private MetricGroup metricGroup; /** @@ -65,6 +69,12 @@ public void register(String groupName, Aspects[] aspects) { } } + /** + * Creates a Dropwizard-compatible histogram backed by a sliding time window reservoir. 
+ * + * @return a new {@code com.codahale.metrics.Histogram} that retains samples for the + * configured {@code SLIDING_TIME_WINDOW} number of seconds + */ private com.codahale.metrics.Histogram getHistogram() { return new com.codahale.metrics.Histogram(new SlidingTimeWindowReservoir(SLIDING_TIME_WINDOW, TimeUnit.SECONDS)); } @@ -105,6 +115,16 @@ public void register(String groupKey, String groupValue, Aspects[] aspects) { } } + /** + * Registers the given aspects against the supplied metric group. + * + *
<p>
For each aspect, a Dropwizard-backed histogram is created when its + * {@link AspectType} is {@link AspectType#Histogram}, and a meter is created when it is + * {@link AspectType#Metric}; aspects of any other type are ignored. + * + * @param group the Flink metric group to register the metrics under + * @param aspects the aspects to register + */ private void register(MetricGroup group, Aspects[] aspects) { for (Aspects aspect : aspects) { if (AspectType.Histogram.equals(aspect.getAspectType())) { diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DaggerDeserializer.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DaggerDeserializer.java index e24cf608e..395489997 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DaggerDeserializer.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DaggerDeserializer.java @@ -4,6 +4,24 @@ import java.io.Serializable; +/** + * Common contract implemented by every Dagger source deserializer. + * + *

<p>A {@code DaggerDeserializer} turns a raw record consumed from a source (for example a Kafka + * record value or a Parquet {@code SimpleGroup}) into a Flink {@link org.apache.flink.types.Row} + * that subsequently flows through the streaming job. Concrete implementations include the + * protobuf, JSON and Parquet deserializers, all of which carry this marker so they can be wired + * into Dagger sources interchangeably. + * + *

<p>The interface deliberately combines two concerns required by Flink. It extends + * {@link Serializable} so the deserializer can be shipped to task managers as part of the + * serialized job graph, and it extends {@link ResultTypeQueryable} so Flink can statically query + * the {@code TypeInformation} of the records this deserializer produces (needed for the SQL + * planner and for state/serializer selection). + * + * @param <T> the element type emitted by the deserializer, typically + * {@link org.apache.flink.types.Row} + */ public interface DaggerDeserializer extends Serializable, ResultTypeQueryable { } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DaggerInternalTypeInformation.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DaggerInternalTypeInformation.java index 2dd563df6..2c794bcde 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DaggerInternalTypeInformation.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DaggerInternalTypeInformation.java @@ -10,9 +10,43 @@ import java.util.ArrayList; import java.util.Arrays; +/** + * Contract for source schema adapters that expose the Flink {@link Row} type produced by a Dagger + * input stream and that append Dagger's internal trailing columns to it. + * + *

Implementations such as {@code ProtoType} and {@code JsonType} translate an external schema + * (a protobuf descriptor or a JSON schema string) into a Flink {@code TypeInformation}. On top + * of the user-visible columns, every row emitted by a Dagger deserializer carries two extra + * trailing fields used internally by the pipeline; {@link #addInternalFields} centralizes how + * those columns are added so all input formats stay consistent. + */ public interface DaggerInternalTypeInformation { + + /** + * Builds the complete Flink row type for this stream, including Dagger's internal trailing + * columns. + * + * @return the {@code TypeInformation} describing the {@link Row} emitted by the + * corresponding deserializer, with the validation flag and rowtime columns appended + */ TypeInformation getRowType(); + /** + * Appends Dagger's two internal trailing columns to an existing row type. + * + *

The supplied {@code initialTypeInfo} (which is expected to be a {@link RowTypeInfo} + * describing the user-facing columns) is extended with two fields, in this exact order: a + * boolean validation flag stored under {@link Constants#INTERNAL_VALIDATION_FIELD_KEY} that + * records whether the source record was parsed successfully, followed by an event-time column + * named {@code rowtimeAttributeName} of type {@code SQL_TIMESTAMP} that Flink uses as the + * rowtime attribute for watermarking and windowing. + * + * @param initialTypeInfo the row type describing the user-visible columns; cast to + * {@link RowTypeInfo} to read its field names and types + * @param rowtimeAttributeName the name to assign to the appended event-time (rowtime) column + * @return a new named {@code TypeInformation} containing the original fields followed by + * the boolean validation flag and the rowtime timestamp column + */ default TypeInformation addInternalFields(TypeInformation initialTypeInfo, String rowtimeAttributeName) { RowTypeInfo rowTypeInfo = (RowTypeInfo) initialTypeInfo; ArrayList fieldNames = new ArrayList<>(Arrays.asList(rowTypeInfo.getFieldNames())); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DataTypes.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DataTypes.java index 6be1bf8e2..6f80edd73 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DataTypes.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/DataTypes.java @@ -1,6 +1,15 @@ package com.gotocompany.dagger.common.serde; +/** + * Enumerates the wire formats Dagger can deserialize records from on an input stream. + * + *

The selected value is typically derived from stream configuration and used to choose the + * matching {@code DaggerDeserializer} implementation (a JSON-backed deserializer versus a + * protobuf-backed one) when a Dagger source is constructed. + */ public enum DataTypes { + /** Records encoded as JSON documents, deserialized against a configured JSON schema. */ JSON, + /** Records encoded as Protocol Buffers messages, deserialized via a Stencil descriptor. */ PROTO } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/json/deserialization/JsonDeserializer.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/json/deserialization/JsonDeserializer.java index 9bce511b6..6c8132699 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/json/deserialization/JsonDeserializer.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/json/deserialization/JsonDeserializer.java @@ -17,11 +17,35 @@ import static com.gotocompany.dagger.common.core.Constants.ROWTIME; +/** + * Kafka deserialization schema that converts JSON-encoded record values into Flink {@link Row} + * instances for a Dagger input stream. + * + *

Parsing is delegated to Flink's {@link JsonRowDeserializationSchema}, which is built from the + * row type produced by {@link JsonType} (the user-visible columns plus Dagger's internal trailing + * columns). After parsing, each row is post-processed so that the trailing validation flag is set + * and the rowtime column is populated with a {@link Timestamp} derived from the configured source + * field. Any parsing or conversion failure is surfaced as a {@link DaggerDeserializationException} + * so that Dagger sources can handle errors uniformly. + */ public class JsonDeserializer implements KafkaDeserializationSchema, DaggerDeserializer { + /** Backing Flink schema that parses JSON bytes into a {@link Row} matching {@link #typeInformation}. */ private final JsonRowDeserializationSchema jsonRowDeserializationSchema; + /** Index, within the produced row, of the field whose value supplies the event-time (rowtime). */ private final int rowtimeIdx; + /** Full produced {@link Row} type, including Dagger's internal validation and rowtime columns. */ private final TypeInformation typeInformation; + /** + * Instantiates a new JSON deserializer for a Dagger source. + * + *

Builds the produced row type from the JSON schema (appending Dagger's internal columns + * under the {@code ROWTIME} attribute name), constructs the backing Flink JSON schema from that + * type, and resolves the index of the field that carries the event time. + * + * @param jsonSchema the JSON schema string describing the stream's records + * @param rowtimeFieldName the name of the field whose value is used as the event-time (rowtime) + */ public JsonDeserializer(String jsonSchema, String rowtimeFieldName) { this.typeInformation = new JsonType(jsonSchema, ROWTIME).getRowType(); this.jsonRowDeserializationSchema = new JsonRowDeserializationSchema.Builder(typeInformation).build(); @@ -29,11 +53,32 @@ public JsonDeserializer(String jsonSchema, String rowtimeFieldName) { this.rowtimeIdx = rowTypeInfo.getFieldIndex(rowtimeFieldName); } + /** + * {@inheritDoc} + * + *

<p>The Kafka-backed stream is treated as unbounded, so this always returns {@code false}; no element is ever + * treated as an end-of-stream marker. + * + * @param nextElement the most recently deserialized row (ignored) + * @return {@code false} always + */ @Override public boolean isEndOfStream(Row nextElement) { return false; } + /** + * {@inheritDoc} + * + *

Parses the Kafka record's value bytes as JSON into a {@link Row} using the backing + * {@link JsonRowDeserializationSchema}, then populates Dagger's internal validation and rowtime + * columns via {@link #addTimestampFieldToRow(Row)}. + * + * @param consumerRecord the Kafka record whose value holds the JSON payload + * @return the deserialized row with its validation flag set and rowtime column populated + * @throws DaggerDeserializationException if the payload cannot be parsed, or the rowtime field + * carries an unsupported type + */ @Override public Row deserialize(ConsumerRecord consumerRecord) { try { @@ -44,11 +89,33 @@ public Row deserialize(ConsumerRecord consumerRecord) { } } + /** + * {@inheritDoc} + * + * @return the {@link Row} {@code TypeInformation} produced by the backing JSON schema, that is + * the user columns followed by Dagger's internal trailing columns + */ @Override public TypeInformation getProducedType() { return jsonRowDeserializationSchema.getProducedType(); } + /** + * Copies the user-visible columns of a freshly parsed row and fills in Dagger's internal + * trailing columns. + * + *

All fields except the final two are copied verbatim into a new {@link Row} of the same + * arity. The event-time value is read from {@link #rowtimeIdx}: a {@link BigDecimal} is + * interpreted as epoch seconds and converted with {@link Instant#ofEpochSecond(long)}, while an + * existing {@link Timestamp} is used as-is. The last column is then set to that timestamp and + * the second-to-last column (the validation flag) is set to {@code true}. + * + * @param row the row produced by the backing JSON schema + * @return a new row holding the user columns plus the populated validation flag and rowtime + * timestamp + * @throws DaggerDeserializationException if the rowtime field is neither a {@link BigDecimal} + * nor a {@link Timestamp} + */ private Row addTimestampFieldToRow(Row row) { Row finalRecord = new Row(row.getArity()); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/json/deserialization/JsonType.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/json/deserialization/JsonType.java index 53aea049b..f1d9955fc 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/json/deserialization/JsonType.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/json/deserialization/JsonType.java @@ -8,15 +8,43 @@ import java.io.Serializable; +/** + * Derives the Flink {@link Row} {@code TypeInformation} for a JSON-encoded input stream from its + * JSON schema. + * + *

The configured JSON schema string is converted into a named row type using Flink's + * {@link JsonRowSchemaConverter}, after which Dagger's internal trailing columns (the boolean + * validation flag and the rowtime timestamp) are appended through {@link #addInternalFields}. + * Instances are {@link Serializable} so the schema description can travel with the serialized job + * graph; the resulting type is consumed when constructing the JSON deserializer for a Kafka source. + */ public class JsonType implements Serializable, DaggerInternalTypeInformation { + /** JSON schema string (JSON-Schema syntax) describing the user-visible columns of the stream. */ private String jsonSchema; + /** Name to assign to the appended event-time (rowtime) column. */ private String rowtimeAttributeName; + /** + * Instantiates a new JSON type descriptor for a stream. + * + * @param jsonSchema the JSON schema string describing the stream's records + * @param rowtimeAttributeName the name of the event-time (rowtime) column to append + */ public JsonType(String jsonSchema, String rowtimeAttributeName) { this.jsonSchema = jsonSchema; this.rowtimeAttributeName = rowtimeAttributeName; } + /** + * {@inheritDoc} + * + *

Converts the configured JSON schema into a named row type via + * {@link JsonRowSchemaConverter#convert(String)} and then appends Dagger's internal validation + * and rowtime columns. + * + * @return the {@code TypeInformation} for the JSON stream, including the internal trailing + * columns + */ public TypeInformation getRowType() { TypeInformation rowNamed = JsonRowSchemaConverter.convert(jsonSchema); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/parquet/SimpleGroupValidation.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/parquet/SimpleGroupValidation.java index 07d5e18d4..0cbf36dcf 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/parquet/SimpleGroupValidation.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/parquet/SimpleGroupValidation.java @@ -9,7 +9,26 @@ import static org.apache.parquet.schema.Type.Repetition.REPEATED; import static org.apache.parquet.schema.Type.Repetition.REQUIRED; +/** + * Static helpers that inspect the schema of a Parquet {@link SimpleGroup} to determine how its + * fields are encoded. + * + *

Dagger reads Parquet records as {@link SimpleGroup} instances and converts them into Flink + * rows. Because map-typed fields may be written either with the legacy two-field repeated-group + * layout or with the standard Parquet {@code MAP} logical-type layout, these utilities let the + * deserializer detect which encoding is present (and whether a field is present and populated at + * all) before attempting to read values. The class exposes only static methods and is not intended + * to be instantiated. + */ public class SimpleGroupValidation { + /** + * Checks that a field is declared in the group's schema and actually carries at least one value. + * + * @param simpleGroup the Parquet group to inspect + * @param fieldName the name of the field to look for + * @return {@code true} if the schema contains {@code fieldName} and its repetition count is + * non-zero, {@code false} otherwise + */ public static boolean checkFieldExistsAndIsInitialized(SimpleGroup simpleGroup, String fieldName) { return simpleGroup.getType().containsField(fieldName) && simpleGroup.getFieldRepetitionCount(fieldName) != 0; } @@ -66,6 +85,18 @@ public static boolean checkIsStandardSimpleGroupMap(SimpleGroup simpleGroup, Str && applyNestedKeyValueFieldValidations(simpleGroup, fieldName); } + /** + * Validates the outer group of a candidate standard Parquet map field. + * + *

For the field to qualify it must be a {@link GroupType} whose repetition is {@code OPTIONAL} + * or {@code REQUIRED}, whose logical type annotation is {@link LogicalTypeAnnotation#mapType()}, + * and which has exactly one child (the nested {@code key_value} group). + * + * @param simpleGroup the Parquet group containing the candidate map field + * @param fieldName the name of the candidate map field + * @return {@code true} if the outer map group matches the standard specification, {@code false} + * otherwise (including when the field is not a group) + */ private static boolean applyMapFieldValidations(SimpleGroup simpleGroup, String fieldName) { Type mapType = simpleGroup.getType().getType(fieldName); if (mapType instanceof GroupType) { @@ -78,6 +109,18 @@ private static boolean applyMapFieldValidations(SimpleGroup simpleGroup, String return false; } + /** + * Validates the nested {@code key_value} group of a candidate standard Parquet map field. + * + *

Assumes the outer field is already known to be a group. The nested {@code key_value} child + * must exist, be a {@code REPEATED} {@link GroupType}, contain a {@code key} field, and that + * {@code key} must itself be {@code REQUIRED}, matching the Apache Parquet map specification. + * + * @param simpleGroup the Parquet group containing the map field + * @param fieldName the name of the map field + * @return {@code true} if the nested key/value structure matches the specification, + * {@code false} otherwise + */ private static boolean applyNestedKeyValueFieldValidations(SimpleGroup simpleGroup, String fieldName) { GroupType mapGroupType = simpleGroup.getType().getType(fieldName).asGroupType(); if (mapGroupType.containsField("key_value")) { diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/parquet/deserialization/SimpleGroupDeserializer.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/parquet/deserialization/SimpleGroupDeserializer.java index f9a0e49e9..51e877281 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/parquet/deserialization/SimpleGroupDeserializer.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/parquet/deserialization/SimpleGroupDeserializer.java @@ -15,12 +15,38 @@ import java.sql.Timestamp; import java.time.Instant; +/** + * Deserializes Parquet records (read as {@link SimpleGroup} instances) into Flink {@link Row} + * objects for a Dagger Parquet source. + * + *

Dagger's Parquet files are written from protobuf schemas, so the row layout is derived from a + * protobuf descriptor resolved by class name through the {@link StencilClientOrchestrator}. Each + * {@link SimpleGroup} is mapped column-by-column via {@link RowFactory}, leaving two trailing slots + * for Dagger's internal columns, which are then filled with a boolean validation flag and an + * event-time (rowtime) {@link Timestamp} extracted from the configured timestamp field. + */ public class SimpleGroupDeserializer implements DaggerDeserializer { + /** Fully-qualified protobuf class name whose descriptor defines the Parquet record schema. */ private final String protoClassName; + /** Proto field number of the timestamp field used to derive the event-time (rowtime). */ private final int timestampFieldIndex; + /** Resolves protobuf descriptors from the Stencil schema registry. */ private final StencilClientOrchestrator stencilClientOrchestrator; + /** Produced {@link Row} type, including Dagger's internal validation and rowtime columns. */ private final TypeInformation typeInformation; + /** + * Instantiates a new Parquet {@link SimpleGroup} deserializer. + * + *

Eagerly builds the produced row type from the protobuf schema via {@link ProtoType} so + * Flink can query it; descriptor resolution for actual deserialization happens lazily when + * {@link #deserialize(SimpleGroup)} is invoked. + * + * @param protoClassName the fully-qualified protobuf class name describing the schema + * @param timestampFieldIndex the proto field number of the timestamp field used for rowtime + * @param rowtimeAttributeName the name to assign to the appended event-time (rowtime) column + * @param stencilClientOrchestrator the orchestrator used to resolve protobuf descriptors + */ public SimpleGroupDeserializer(String protoClassName, int timestampFieldIndex, String rowtimeAttributeName, StencilClientOrchestrator stencilClientOrchestrator) { this.protoClassName = protoClassName; this.timestampFieldIndex = timestampFieldIndex; @@ -28,6 +54,13 @@ public SimpleGroupDeserializer(String protoClassName, int timestampFieldIndex, S this.typeInformation = new ProtoType(protoClassName, rowtimeAttributeName, stencilClientOrchestrator).getRowType(); } + /** + * Resolves the protobuf descriptor for {@link #protoClassName} from the Stencil registry. + * + * @return the descriptor describing the Parquet record schema + * @throws DescriptorNotFoundException if no descriptor is registered for the configured class + * name + */ private Descriptors.Descriptor getProtoParser() { Descriptors.Descriptor dsc = stencilClientOrchestrator.getStencilClient().get(protoClassName); if (dsc == null) { @@ -36,6 +69,17 @@ private Descriptors.Descriptor getProtoParser() { return dsc; } + /** + * Converts a single Parquet {@link SimpleGroup} into a Flink {@link Row}. + * + *

The row is built from the resolved descriptor with two extra trailing slots reserved for + * Dagger's internal columns, which are subsequently populated by + * {@link #addTimestampFieldToRow(Row, SimpleGroup, Descriptors.Descriptor)}. + * + * @param simpleGroup the Parquet record to convert + * @return the resulting row with its validation flag and rowtime timestamp populated + * @throws DaggerDeserializationException if conversion of the record fails + */ public Row deserialize(SimpleGroup simpleGroup) { Descriptors.Descriptor descriptor = getProtoParser(); try { @@ -46,6 +90,19 @@ public Row deserialize(SimpleGroup simpleGroup) { } } + /** + * Populates Dagger's internal trailing columns on an already-mapped Parquet row. + * + *

Reads the configured timestamp field (located by {@link #timestampFieldIndex}) using a + * {@link TimestampHandler}, which yields a two-element row of epoch seconds and nanoseconds. The + * second-to-last column (the validation flag) is set to {@code true} and the last column is set + * to the corresponding {@link Timestamp} built via {@link Instant#ofEpochSecond(long, long)}. + * + * @param row the row already populated with the user-visible columns + * @param simpleGroup the source Parquet record holding the timestamp field + * @param descriptor the protobuf descriptor used to locate the timestamp field by number + * @return the same row instance, with its validation flag and rowtime column populated + */ private Row addTimestampFieldToRow(Row row, SimpleGroup simpleGroup, Descriptors.Descriptor descriptor) { Descriptors.FieldDescriptor fieldDescriptor = descriptor.findFieldByNumber(timestampFieldIndex); TimestampHandler timestampHandler = new TimestampHandler(fieldDescriptor); @@ -58,6 +115,12 @@ private Row addTimestampFieldToRow(Row row, SimpleGroup simpleGroup, Descriptors return row; } + /** + * {@inheritDoc} + * + * @return the produced {@link Row} {@code TypeInformation}, including Dagger's internal trailing + * columns + */ @Override public TypeInformation getProducedType() { return this.typeInformation; diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/deserialization/ProtoDeserializer.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/deserialization/ProtoDeserializer.java index 6da14e54c..5a56507a2 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/deserialization/ProtoDeserializer.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/deserialization/ProtoDeserializer.java @@ -25,12 +25,33 @@ */ public class ProtoDeserializer implements KafkaDeserializationSchema, DaggerDeserializer { + /** + * Logger used to warn about null payloads 
and invalid protobuf records. + */ private static final Logger LOGGER = LoggerFactory.getLogger(ProtoDeserializer.class); + /** + * The fully-qualified protobuf class name used to resolve the message descriptor. + */ private final String protoClassName; + /** + * The field number of the protobuf timestamp field appended as the rowtime column. + */ private final int timestampFieldIndex; + /** + * The orchestrator used to obtain the Stencil client that resolves proto descriptors. + */ private final StencilClientOrchestrator stencilClientOrchestrator; + /** + * The Flink {@link TypeInformation} describing the {@link Row} produced by this deserializer. + */ private final TypeInformation typeInformation; + /** + * Cache of field descriptors used to build rows efficiently when schema auto-refresh is on. + */ private final FieldDescriptorCache fieldDescriptorCache; + /** + * Whether the Stencil cache auto-refresh is enabled, which selects the descriptor-cache row path. + */ private final boolean stencilAutoRefreshEnable; /** @@ -50,11 +71,33 @@ public ProtoDeserializer(String protoClassName, int timestampFieldIndex, String this.stencilAutoRefreshEnable = stencilClientOrchestrator.createStencilConfig().getCacheAutoRefresh(); } + /** + * {@inheritDoc} + * + *

This stream is unbounded, so the implementation always reports that the end of stream + * has not been reached. + * + * @param nextElement the most recently deserialized row + * @return {@code false} always, since the Kafka source is treated as never-ending + */ @Override public boolean isEndOfStream(Row nextElement) { return false; } + /** + * Deserializes a Kafka record into a Flink {@link Row}. + * + *

A {@code null} payload, or a record that fails protobuf parsing, yields a default + * "invalid" row (flagged as invalid with a zero timestamp) rather than failing the job; + * a successfully parsed message is converted and augmented with its rowtime timestamp. + * + * @param consumerRecord the Kafka record whose key and value byte arrays are read + * @return the deserialized row, or a default invalid row when the value is {@code null} + * or cannot be parsed as the expected protobuf message + * @throws DescriptorNotFoundException if the proto descriptor cannot be resolved + * @throws DaggerDeserializationException if an unexpected runtime error occurs while parsing + */ @Override public Row deserialize(ConsumerRecord consumerRecord) { Descriptors.Descriptor descriptor = getProtoParser(); @@ -76,11 +119,23 @@ public Row deserialize(ConsumerRecord consumerRecord) { } } + /** + * {@inheritDoc} + * + * @return the {@link TypeInformation} of the {@link Row} this deserializer produces + */ @Override public TypeInformation getProducedType() { return this.typeInformation; } + /** + * Resolves the protobuf message {@link Descriptors.Descriptor} for {@code protoClassName} + * from the Stencil client. + * + * @return the descriptor for the configured proto class + * @throws DescriptorNotFoundException if no descriptor is registered for {@code protoClassName} + */ private Descriptors.Descriptor getProtoParser() { Descriptors.Descriptor dsc = stencilClientOrchestrator.getStencilClient().get(protoClassName); if (dsc == null) { @@ -89,6 +144,15 @@ private Descriptors.Descriptor getProtoParser() { return dsc; } + /** + * Builds a placeholder {@link Row} for records that cannot be deserialized. + * + *

The row is created from the proto default instance with two extra trailing columns, + * the validity flag set to {@code false} and the rowtime set to epoch zero. + * + * @param defaultInstance the default protobuf message instance used to shape the row + * @return a row flagged as invalid with a zero timestamp + */ private Row createDefaultInvalidRow(DynamicMessage defaultInstance) { Row row; if (stencilAutoRefreshEnable) { @@ -101,6 +165,15 @@ private Row createDefaultInvalidRow(DynamicMessage defaultInstance) { return row; } + /** + * Converts a parsed protobuf message into a {@link Row} and appends rowtime metadata. + * + *

Two trailing columns are added: a validity flag set to {@code true} and a + * {@link Timestamp} derived from the seconds and nanos of the configured timestamp field. + * + * @param proto the successfully parsed protobuf message + * @return the row representation including the validity flag and event-time timestamp + */ private Row addTimestampFieldToRow(DynamicMessage proto) { Row finalRecord; if (stencilAutoRefreshEnable) { diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/deserialization/ProtoType.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/deserialization/ProtoType.java index 3e154a16e..14dadc439 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/deserialization/ProtoType.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/deserialization/ProtoType.java @@ -17,9 +17,22 @@ */ public class ProtoType implements Serializable, DaggerInternalTypeInformation { + /** + * The cached, lazily-resolved protobuf {@link Descriptor} for {@code protoClassName}; + * marked {@code transient} because descriptors are not serializable. + */ private transient Descriptor protoFieldDescriptor; + /** + * The fully-qualified protobuf class name whose schema drives the row type. + */ private String protoClassName; + /** + * The name of the attribute that carries the Flink rowtime (event-time) field. + */ private String rowtimeAttributeName; + /** + * The orchestrator used to obtain the Stencil client that resolves proto descriptors. + */ private StencilClientOrchestrator stencilClientOrchestrator; /** @@ -45,6 +58,11 @@ public TypeInformation getRowType() { return addInternalFields(rowNamed, rowtimeAttributeName); } + /** + * Returns the protobuf {@link Descriptor}, resolving and caching it on first access. 
+ * + * @return the proto field descriptor for {@code protoClassName} + */ private Descriptor getProtoFieldDescriptor() { if (protoFieldDescriptor == null) { protoFieldDescriptor = createFieldDescriptor(); @@ -52,6 +70,12 @@ private Descriptor getProtoFieldDescriptor() { return protoFieldDescriptor; } + /** + * Resolves the protobuf {@link Descriptor} for {@code protoClassName} via the Stencil client. + * + * @return the resolved descriptor + * @throws DescriptorNotFoundException if no descriptor is registered for {@code protoClassName} + */ private Descriptor createFieldDescriptor() { Descriptors.Descriptor dsc = stencilClientOrchestrator.getStencilClient().get(protoClassName); if (dsc == null) { diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/serialization/KafkaProtoSerializer.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/serialization/KafkaProtoSerializer.java index 3716b8daa..d698a83b0 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/serialization/KafkaProtoSerializer.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/serialization/KafkaProtoSerializer.java @@ -11,25 +11,76 @@ import java.util.Objects; +/** + * Kafka sink serialization schema that turns Flink {@link Row} records into protobuf-encoded + * {@link ProducerRecord}s for a Dagger Kafka sink. + * + *

The actual row-to-protobuf encoding is delegated to a {@link ProtoSerializer}, which produces + * the key and value byte arrays; this class wraps those bytes into a {@link ProducerRecord} aimed + * at the configured output topic. It implements Flink's {@code KafkaRecordSerializationSchema} so it + * can be plugged directly into a Flink {@code KafkaSink}. + */ public class KafkaProtoSerializer implements KafkaRecordSerializationSchema { + /** Kafka topic the serialized records are published to; must be non-empty when serializing. */ private final String outputTopic; + /** Delegate that encodes a {@link Row} into protobuf key and value byte arrays. */ private final ProtoSerializer protoSerializer; + /** Logger (named {@code "KafkaSink"}) used to trace rows being written to Kafka. */ private static final Logger LOGGER = LoggerFactory.getLogger("KafkaSink"); + /** + * Creates a serializer with an empty output topic, delegating to + * {@link #KafkaProtoSerializer(ProtoSerializer, String)}. + * + *

An output topic must be configured before records are written, because + * {@link #serialize(Row, KafkaSinkContext, Long)} rejects an empty topic with a + * {@link DaggerSerializationException}. + * + * @param protoSerializer the delegate that encodes rows into protobuf key/value bytes + */ public KafkaProtoSerializer(ProtoSerializer protoSerializer) { this(protoSerializer, ""); } + /** + * Creates a serializer targeting a specific Kafka topic. + * + * @param protoSerializer the delegate that encodes rows into protobuf key/value bytes + * @param outputTopic the Kafka topic to publish serialized records to + */ public KafkaProtoSerializer(ProtoSerializer protoSerializer, String outputTopic) { this.protoSerializer = protoSerializer; this.outputTopic = outputTopic; } + /** + * {@inheritDoc} + * + *

Delegates to the default {@link KafkaRecordSerializationSchema} initialization; this + * serializer holds no additional state that needs setting up. + * + * @param context the serialization initialization context + * @param sinkContext the Kafka sink context + * @throws Exception if the default initialization fails + */ @Override public void open(InitializationContext context, KafkaSinkContext sinkContext) throws Exception { KafkaRecordSerializationSchema.super.open(context, sinkContext); } + /** + * {@inheritDoc} + * + *

Encodes the given row into protobuf key and value byte arrays via the delegate + * {@link ProtoSerializer} and wraps them in a {@link ProducerRecord} for the configured output + * topic. The row being written is logged at info level. + * + * @param row the Flink row to serialize + * @param context the Kafka sink context (unused) + * @param timestamp the event timestamp supplied by Flink (unused) + * @return a {@link ProducerRecord} carrying the protobuf key and value for the output topic + * @throws DaggerSerializationException if no output topic has been configured + */ @Override public ProducerRecord serialize(Row row, KafkaSinkContext context, Long timestamp) { if (Objects.isNull(outputTopic) || outputTopic.equals("")) { diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/serialization/ProtoSerializer.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/serialization/ProtoSerializer.java index 1ddcf5e77..9d61aa00e 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/serialization/ProtoSerializer.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/proto/serialization/ProtoSerializer.java @@ -14,13 +14,42 @@ import java.util.Arrays; import java.util.Objects; +/** + * Serializes Flink {@link Row} records into protobuf byte arrays for Kafka keys and values. + * + *

Column names supplied at construction map row fields to protobuf fields (supporting nested + * fields via dot-separated names), and a {@link StencilClientOrchestrator} resolves the key and + * message descriptors from the schema registry. + */ public class ProtoSerializer implements Serializable { + /** + * The fully-qualified protobuf class name used to serialize the Kafka record key; + * may be {@code null} or empty when no key is produced. + */ private final String keyProtoClassName; + /** + * The row column names, positionally aligned with the fields of each {@link Row}. + */ private final String[] columnNames; + /** + * The orchestrator used to resolve protobuf descriptors from the Stencil registry. + */ private final StencilClientOrchestrator stencilClientOrchestrator; + /** + * The fully-qualified protobuf class name used to serialize the Kafka record value. + */ private final String messageProtoClassName; + /** + * Instantiates a new proto serializer. + * + * @param keyProtoClassName the protobuf class name for the record key, may be {@code null} or empty + * @param messageProtoClassName the protobuf class name for the record value; required and non-empty + * @param columnNames the column names mapping row fields to protobuf fields + * @param stencilClientOrchestrator the orchestrator used to resolve proto descriptors + * @throws DaggerSerializationException if {@code messageProtoClassName} is {@code null} or empty + */ public ProtoSerializer(String keyProtoClassName, String messageProtoClassName, String[] columnNames, StencilClientOrchestrator stencilClientOrchestrator) { this.keyProtoClassName = keyProtoClassName; this.columnNames = columnNames; @@ -29,6 +58,11 @@ public ProtoSerializer(String keyProtoClassName, String messageProtoClassName, S checkValidity(); } + /** + * Validates that a non-empty message proto class name was supplied. 
+ * + * @throws DaggerSerializationException if {@code messageProtoClassName} is {@code null} or empty + */ private void checkValidity() { if (Objects.isNull(messageProtoClassName) || messageProtoClassName.isEmpty()) { throw new DaggerSerializationException("messageProtoClassName is required"); @@ -46,10 +80,26 @@ public byte[] serializeKey(Row row) { : parse(row, getDescriptor(keyProtoClassName)).toByteArray(); } + /** + * Serializes the value portion of a row into protobuf bytes. + * + * @param row the row to serialize using {@code messageProtoClassName} + * @return the serialized protobuf message as a byte array + */ public byte[] serializeValue(Row row) { return parse(row, getDescriptor(messageProtoClassName)).toByteArray(); } + /** + * Builds a {@link DynamicMessage} from a row using the given descriptor. + * + *

Each column is mapped onto the corresponding protobuf field; dot-separated column + * names are routed to nested message builders, while unknown top-level fields are skipped. + * + * @param element the row whose fields are written into the message + * @param descriptor the descriptor of the protobuf message being built + * @return the populated protobuf message + */ private DynamicMessage parse(Row element, Descriptors.Descriptor descriptor) { int numberOfElements = element.getArity(); DynamicMessage.Builder builder = DynamicMessage.newBuilder(descriptor); @@ -71,6 +121,13 @@ private DynamicMessage parse(Row element, Descriptors.Descriptor descriptor) { return builder.build(); } + /** + * Resolves the protobuf {@link Descriptors.Descriptor} for the given class name via Stencil. + * + * @param className the fully-qualified protobuf class name to resolve + * @return the descriptor for {@code className} + * @throws DescriptorNotFoundException if no descriptor is registered for {@code className} + */ private Descriptors.Descriptor getDescriptor(String className) { Descriptors.Descriptor dsc = stencilClientOrchestrator.getStencilClient().get(className); if (dsc == null) { @@ -79,6 +136,20 @@ private Descriptors.Descriptor getDescriptor(String className) { return dsc; } + /** + * Recursively populates a nested protobuf field addressed by a dot-separated column path. + * + *

The first element of {@code nestedColumnNames} selects a child field on + * {@code parentDescriptor}; the method descends into the child message builder until the + * leaf field is reached, then writes {@code data} into it. + * + * @param parentDescriptor the descriptor of the message currently being populated + * @param nestedColumnNames the remaining path segments identifying the target field + * @param parentBuilder the builder of the message currently being populated + * @param data the value to set on the leaf field + * @return the parent builder with the nested field populated + * @throws InvalidColumnMappingException if a path segment does not exist on the descriptor + */ private DynamicMessage.Builder populateNestedBuilder(Descriptors.Descriptor parentDescriptor, String[] nestedColumnNames, DynamicMessage.Builder parentBuilder, Object data) { String childColumnName = nestedColumnNames[0]; Descriptors.FieldDescriptor childFieldDescriptor = parentDescriptor.findFieldByName(childColumnName); @@ -95,6 +166,18 @@ private DynamicMessage.Builder populateNestedBuilder(Descriptors.Descriptor pare return parentBuilder; } + /** + * Sets a single, non-nested protobuf field on the given builder. + * + *

A {@code null} field descriptor or {@code null} data leaves the builder unchanged; + * otherwise the value is converted by the {@link TypeHandler} resolved for the field. + * + * @param builder the builder to populate + * @param fieldDescriptor the target protobuf field, may be {@code null} + * @param data the value to write, may be {@code null} + * @return the (possibly unchanged) builder + * @throws InvalidColumnMappingException if {@code data}'s type does not match the field type + */ private DynamicMessage.Builder populateBuilder(DynamicMessage.Builder builder, Descriptors.FieldDescriptor fieldDescriptor, Object data) { if (fieldDescriptor == null) { return builder; diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/PrimitiveTypeHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/PrimitiveTypeHandler.java index 43b904077..8ad899125 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/PrimitiveTypeHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/PrimitiveTypeHandler.java @@ -14,6 +14,10 @@ * The type Primitive proto handler. */ public class PrimitiveTypeHandler implements TypeHandler { + /** + * The protobuf {@code FieldDescriptor} describing the primitive field this handler + * converts between its protobuf representation and the corresponding Flink type. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -25,21 +29,54 @@ public PrimitiveTypeHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Indicates that this handler can process the field. + * + *

{@code PrimitiveTypeHandler} is the fallback handler chosen by + * {@code TypeHandlerFactory} when no more specific handler matches, so it always + * reports that it can handle the field. + * + * @return {@code true}, always + */ @Override public boolean canHandle() { return true; } + /** + * Writes the given primitive value onto the supplied protobuf message builder. + * + *

When {@code field} is {@code null} the builder is returned untouched; otherwise the + * value is parsed into the descriptor's primitive type before being set. + * + * @param builder the dynamic message builder being populated + * @param field the Flink-side value to convert and set, or {@code null} to skip + * @return the same {@code builder}, with the field set when a non-null value was provided + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { return field != null ? builder.setField(fieldDescriptor, transform(field)) : builder; } + /** + * Converts a value coming from a post processor into the descriptor's primitive Java type. + * + * @param field the raw value emitted by an upstream post processor + * @return the parsed primitive value + */ @Override public Object transformFromPostProcessor(Object field) { return transform(field); } + /** + * Parses the raw value into the descriptor's primitive type using the matching + * {@code PrimitiveHandler}. + * + * @param field the raw value to parse + * @return the parsed primitive value + * @throws InvalidDataTypeException if the value cannot be parsed into the field's expected type + */ private Object transform(Object field) { PrimitiveHandler primitiveHandler = PrimitiveHandlerFactory.getTypeHandler(fieldDescriptor); try { @@ -50,27 +87,63 @@ private Object transform(Object field) { } } + /** + * Returns the primitive value read from a protobuf message unchanged. + * + *

Primitive protobuf values already map directly onto Flink types, so no conversion + * is required. + * + * @param field the value read from the protobuf message + * @return the same {@code field} value + */ @Override public Object transformFromProto(Object field) { return field; } + /** + * Returns the primitive protobuf value unchanged, ignoring the descriptor cache. + * + *

The {@code cache} is accepted for interface compatibility but is not needed for + * primitive fields, which require no nested descriptor lookups. + * + * @param field the value read from the protobuf message + * @param cache the field descriptor cache, unused for primitive fields + * @return the same {@code field} value + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return field; } + /** + * Reads the primitive value for this field from a Parquet {@code SimpleGroup}. + * + * @param simpleGroup the Parquet group holding the row being deserialized + * @return the parsed primitive value, or the type's default when the field is absent + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { PrimitiveHandler primitiveHandler = PrimitiveHandlerFactory.getTypeHandler(fieldDescriptor); return primitiveHandler.parseSimpleGroup(simpleGroup); } + /** + * Returns the primitive value unchanged for JSON serialization. + * + * @param field the primitive value to emit + * @return the same {@code field} value + */ @Override public Object transformToJson(Object field) { return field; } + /** + * Returns the Flink {@code TypeInformation} for this primitive field. 
+ * + * @return the type information supplied by the matching {@code PrimitiveHandler} + */ @Override public TypeInformation getTypeInformation() { PrimitiveHandler primitiveHandler = PrimitiveHandlerFactory.getTypeHandler(fieldDescriptor); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/RowFactory.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/RowFactory.java index 790d707a7..198a90141 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/RowFactory.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/RowFactory.java @@ -81,6 +81,18 @@ public static Row createRow(DynamicMessage proto, int extraColumns, FieldDescrip } + /** + * Builds a Flink {@code Row} from a Parquet {@code SimpleGroup}, reserving extra trailing columns. + * + *

Each field declared by the descriptor is deserialized through its matching + * {@code TypeHandler} and placed at the field's index; the row is widened by + * {@code extraColumns} so callers can append derived columns afterwards. + * + * @param descriptor the protobuf descriptor describing the row layout + * @param simpleGroup the Parquet group holding the encoded record + * @param extraColumns the number of additional, initially empty columns to append + * @return the populated row + */ public static Row createRow(Descriptors.Descriptor descriptor, SimpleGroup simpleGroup, int extraColumns) { List descriptorFields = descriptor.getFields(); Row row = new Row(descriptorFields.size() + extraColumns); @@ -91,6 +103,13 @@ public static Row createRow(Descriptors.Descriptor descriptor, SimpleGroup simpl return row; } + /** + * Builds a Flink {@code Row} from a Parquet {@code SimpleGroup} with no extra columns. + * + * @param descriptor the protobuf descriptor describing the row layout + * @param simpleGroup the Parquet group holding the encoded record + * @return the populated row + */ public static Row createRow(Descriptors.Descriptor descriptor, SimpleGroup simpleGroup) { return createRow(descriptor, simpleGroup, 0); } diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/TypeHandlerFactory.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/TypeHandlerFactory.java index 852155260..0a5d84e53 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/TypeHandlerFactory.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/TypeHandlerFactory.java @@ -23,6 +23,12 @@ * The factory class for Type handler. */ public class TypeHandlerFactory { + /** + * Cache of previously resolved handlers, keyed by each field's fully qualified name. + * + *

Every entry pairs the field descriptor's hash code with its {@code TypeHandler} so a + * cached handler can be reused, while still being rebuilt whenever the descriptor changes. + */ private static Map> typeHandlerMap = new ConcurrentHashMap<>(); /** @@ -60,6 +66,16 @@ protected static void clearTypeHandlerMap() { typeHandlerMap.clear(); } + /** + * Builds the ordered list of candidate handlers for the given field. + * + *

Ordering is significant: {@code getTypeHandler} selects the first handler whose + * {@code canHandle()} returns {@code true}, so more specific handlers (maps, timestamps, + * enums, structs and repeated variants) are listed before the generic message handler. + * + * @param fieldDescriptor the field descriptor to build candidate handlers for + * @return the ordered list of candidate handlers to try + */ private static List getSpecificHandlers(Descriptors.FieldDescriptor fieldDescriptor) { return Arrays.asList( new MapHandler(fieldDescriptor), diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/EnumHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/EnumHandler.java index 0b0621cd6..ac0652707 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/EnumHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/EnumHandler.java @@ -15,6 +15,10 @@ * The type Enum proto handler. */ public class EnumHandler implements TypeHandler { + /** + * The protobuf {@code FieldDescriptor} of the enum field this handler converts to and + * from its string name representation. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -26,11 +30,27 @@ public EnumHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field is a non-repeated protobuf {@code enum} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.ENUM && !fieldDescriptor.isRepeated(); } + /** + * Sets the enum field on the builder by resolving the value's name to an enum constant. + * + *

The incoming value is treated as the enum constant name (trimmed). When the handler + * cannot apply or {@code field} is {@code null}, the builder is returned unchanged. + * + * @param builder the dynamic message builder being populated + * @param field the enum constant name to set, or {@code null} to skip + * @return the same {@code builder}, with the enum field set when resolvable + * @throws EnumFieldNotFoundException if the name does not match any constant of the enum + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { if (!canHandle() || field == null) { @@ -44,6 +64,15 @@ public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder bui return builder.setField(fieldDescriptor, valueByName); } + /** + * Resolves a post-processor value to a protobuf enum constant name. + * + *

The input may be either the enum's numeric position or its name; when it matches + * neither, the enum's zero-numbered (default) constant name is returned. + * + * @param field the value to resolve, defaulting to {@code "0"} when {@code null} + * @return the resolved enum constant name + */ @Override public Object transformFromPostProcessor(Object field) { String input = field != null ? field.toString() : "0"; @@ -57,16 +86,37 @@ public Object transformFromPostProcessor(Object field) { } } + /** + * Converts an enum value read from a protobuf message into its trimmed string name. + * + * @param field the enum value descriptor read from the message + * @return the enum constant name as a string + */ @Override public Object transformFromProto(Object field) { return String.valueOf(field).trim(); } + /** + * Converts the protobuf enum value into its trimmed string name, ignoring the cache. + * + * @param field the enum value descriptor read from the message + * @param cache the field descriptor cache, unused for enum fields + * @return the enum constant name as a string + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return String.valueOf(field).trim(); } + /** + * Reads the enum field from a Parquet {@code SimpleGroup} as its constant name. + * + *

Unknown or absent values fall back to the enum's zero-numbered default constant. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the resolved enum constant name, or the default constant name when missing + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { String defaultEnumValue = fieldDescriptor.getEnumType().findValueByNumber(0).getName(); @@ -79,11 +129,22 @@ public Object transformFromParquet(SimpleGroup simpleGroup) { return defaultEnumValue; } + /** + * Returns the enum constant name unchanged for JSON serialization. + * + * @param field the enum constant name + * @return the same {@code field} value + */ @Override public Object transformToJson(Object field) { return field; } + /** + * Returns the Flink {@code TypeInformation} used to represent this enum field. + * + * @return {@code Types.STRING}, since enum constants are represented by their name + */ @Override public TypeInformation getTypeInformation() { return Types.STRING; diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/GoogleProtobufComplexMessageHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/GoogleProtobufComplexMessageHandler.java index 9076d6b98..92f4e2933 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/GoogleProtobufComplexMessageHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/GoogleProtobufComplexMessageHandler.java @@ -30,6 +30,11 @@ */ public class GoogleProtobufComplexMessageHandler implements TypeHandler { + /** + * The set of fully qualified protobuf type names that this handler recognizes as + * dynamic, JSON-like complex types ({@code Struct}, {@code Value}, {@code ListValue} + * and {@code NullValue}; note that {@code NullValue} is a protobuf enum, not a message). 
+ */ private static final Set RECOGNIZED_COMPLEX_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( "google.protobuf.Struct", "google.protobuf.Value", @@ -37,18 +42,42 @@ public class GoogleProtobufComplexMessageHandler implements TypeHandler { "google.protobuf.NullValue" ))); + /** + * The protobuf {@code FieldDescriptor} of the complex message field handled here. + */ private final Descriptors.FieldDescriptor fieldDescriptor; + /** + * Instantiates a new handler for dynamic Google Protobuf complex message types. + * + * @param fieldDescriptor the descriptor of the complex message field to handle + */ public GoogleProtobufComplexMessageHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field is a message whose type is one of the recognized + * complex protobuf types + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.MESSAGE && RECOGNIZED_COMPLEX_TYPES.contains(fieldDescriptor.getMessageType().getFullName()); } + /** + * Serializes a recognized complex protobuf message into its raw byte-array form. + * + *

Because these {@code Struct}-family types are dynamic and recursive, they are stored + * as the message's serialized bytes and reconstructed later using the field descriptor. + * + * @param field the protobuf value read from the message; expected to be a {@code DynamicMessage} + * @return the serialized bytes of the message, or {@code null} when the value is empty, + * absent, or not a {@code DynamicMessage} + */ @Override public Object transformFromProto(Object field) { if (field == null) { @@ -65,11 +94,33 @@ public Object transformFromProto(Object field) { return null; } + /** + * Serializes the complex protobuf message to bytes, delegating to {@code transformFromProto}. + * + *

The descriptor cache is not required for these self-describing complex types. + * + * @param field the protobuf value read from the message + * @param cache the field descriptor cache, unused for this conversion + * @return the serialized bytes of the message, or {@code null} when there is no usable value + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return transformFromProto(field); } + /** + * Reconstructs a complex protobuf message from its byte-array form and sets it on the builder. + * + *

The {@code field} is expected to be the serialized bytes previously produced by + * {@code transformFromProto}; it is parsed back into a {@code DynamicMessage} using the + * field's message type. When the handler cannot apply or {@code field} is {@code null}, + * the builder is returned unchanged. + * + * @param builder the dynamic message builder being populated + * @param field the serialized message bytes to parse and set, or {@code null} to skip + * @return the same {@code builder}, with the field set when bytes were provided + * @throws RuntimeException if the bytes cannot be parsed into the field's message type + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { if (!canHandle() || field == null) { @@ -85,21 +136,44 @@ public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder bui } } + /** + * Returns the post-processor value unchanged. + * + * @param field the value supplied by an upstream post processor + * @return the same {@code field} value + */ @Override public Object transformFromPostProcessor(Object field) { return field; } + /** + * Returns {@code null}, as these complex types are not read from Parquet sources. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return {@code null}, always + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { return null; } + /** + * Returns {@code null}, as these complex types are not emitted to JSON by this handler. + * + * @param field the value that would be serialized + * @return {@code null}, always + */ @Override public Object transformToJson(Object field) { return null; } + /** + * Returns the Flink {@code TypeInformation} used to represent this field. 
+ * + * @return a primitive byte-array type, matching the serialized byte representation + */ @Override public TypeInformation getTypeInformation() { return Types.PRIMITIVE_ARRAY(Types.BYTE); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/MapHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/MapHandler.java index 266129138..6c4dccfe2 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/MapHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/MapHandler.java @@ -27,7 +27,13 @@ */ public class MapHandler implements TypeHandler { + /** + * The protobuf {@code FieldDescriptor} of the map field being converted. + */ private Descriptors.FieldDescriptor fieldDescriptor; + /** + * Delegate handler that treats the map's entries as a repeated key/value message. + */ private TypeHandler repeatedMessageHandler; /** @@ -40,11 +46,27 @@ public MapHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.repeatedMessageHandler = new RepeatedMessageHandler(fieldDescriptor); } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field is a protobuf {@code map} field + */ @Override public boolean canHandle() { return fieldDescriptor.isMapField(); } + /** + * Sets the map field on the builder by encoding its entries as repeated key/value messages. + * + *

A {@code Map} input is first turned into rows of {@code (key, value)} pairs; any other + * input is passed straight to the underlying repeated-message handler. When the handler + * cannot apply or {@code field} is {@code null}, the builder is returned unchanged. + * + * @param builder the dynamic message builder being populated + * @param field the map (or pre-built rows) to encode, or {@code null} to skip + * @return the same {@code builder}, with the map entries set when provided + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { if (!canHandle() || field == null) { @@ -61,6 +83,16 @@ public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder bui return repeatedMessageHandler.transformToProtoBuilder(builder, field); } + /** + * Converts a post-processor value into an array of key/value {@code Row} entries. + * + *

For a {@code Map} input, each entry's key and value are converted with their own + * handlers; a {@code List} input is delegated to the repeated-message handler. Any other + * input (including {@code null}) yields an empty array. + * + * @param field the map or list value emitted by an upstream post processor + * @return an array of two-field rows, one per map entry + */ @Override public Object transformFromPostProcessor(Object field) { ArrayList rows = new ArrayList<>(); @@ -85,16 +117,38 @@ public Object transformFromPostProcessor(Object field) { return rows.toArray(); } + /** + * Converts the map entries read from a protobuf message into key/value rows. + * + * @param field the repeated map-entry value read from the message + * @return an array of two-field rows, one per map entry + */ @Override public Object transformFromProto(Object field) { return repeatedMessageHandler.transformFromProto(field); } + /** + * Converts the protobuf map entries into key/value rows using the descriptor cache. + * + * @param field the repeated map-entry value read from the message + * @param cache the field descriptor cache used to resolve nested field indices + * @return an array of two-field rows, one per map entry + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return repeatedMessageHandler.transformFromProtoUsingCache(field, cache); } + /** + * Reads the map field from a Parquet {@code SimpleGroup} into key/value rows. + * + *

Both the legacy and the standard ({@code key_value}-wrapped) Parquet map encodings are + * supported; an empty array is returned when the field is missing. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return an array of two-field rows, one per map entry, or an empty array when absent + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -108,6 +162,13 @@ public Object transformFromParquet(SimpleGroup simpleGroup) { return new Row[0]; } + /** + * Deserializes a legacy-encoded Parquet map, where entries are repeated directly on the field. + * + * @param simpleGroup the Parquet group containing the map field + * @param fieldName the name of the map field to read + * @return the deserialized key/value rows + */ private Row[] transformLegacyMapFromSimpleGroup(SimpleGroup simpleGroup, String fieldName) { ArrayList deserializedRows = new ArrayList<>(); int repetitionCount = simpleGroup.getFieldRepetitionCount(fieldName); @@ -119,6 +180,14 @@ private Row[] transformLegacyMapFromSimpleGroup(SimpleGroup simpleGroup, String return deserializedRows.toArray(new Row[]{}); } + /** + * Deserializes a standard-encoded Parquet map, where entries are nested under a + * {@code key_value} group. + * + * @param simpleGroup the Parquet group containing the map field + * @param fieldName the name of the map field to read + * @return the deserialized key/value rows + */ private Row[] transformStandardMapFromSimpleGroup(SimpleGroup simpleGroup, String fieldName) { ArrayList deserializedRows = new ArrayList<>(); final String innerFieldName = "key_value"; @@ -132,11 +201,22 @@ private Row[] transformStandardMapFromSimpleGroup(SimpleGroup simpleGroup, Strin return deserializedRows.toArray(new Row[]{}); } + /** + * Returns {@code null}, as map fields are not serialized to JSON by this handler. 
+ * + * @param field the value that would be serialized + * @return {@code null}, always + */ @Override public Object transformToJson(Object field) { return null; } + /** + * Returns the Flink {@code TypeInformation} used to represent this map field. + * + * @return an object-array type whose element is the key/value row type + */ @Override public TypeInformation getTypeInformation() { return Types.OBJECT_ARRAY(TypeInformationFactory.getRowType(fieldDescriptor.getMessageType())); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/MessageHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/MessageHandler.java index c7dcd12f9..0e5bd1909 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/MessageHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/MessageHandler.java @@ -25,9 +25,21 @@ * The type Message proto handler. */ public class MessageHandler implements TypeHandler { + /** + * The protobuf {@code FieldDescriptor} of the nested message field being converted. + */ private FieldDescriptor fieldDescriptor; + /** + * Lazily created schema used to serialize the message row to JSON. + */ private JsonRowSerializationSchema jsonRowSerializationSchema; + /** + * The default (empty) instance of the message, used when a Parquet value is absent. + */ private DynamicMessage defaultMessageInstance; + /** + * The descriptor of the nested message type, cached for deserialization. + */ private Descriptors.Descriptor fieldMessageDescriptor; /** @@ -43,11 +55,27 @@ public MessageHandler(FieldDescriptor fieldDescriptor) { } } + /** + * Determines whether this handler applies to the field. 
+ * + * @return {@code true} if the field is a message type other than {@code google.protobuf.Timestamp} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == MESSAGE && !fieldDescriptor.getMessageType().getFullName().equals("google.protobuf.Timestamp"); } + /** + * Builds the nested protobuf message from a Flink {@code Row} and sets it on the builder. + * + *
<p>
Each nested field present in the row is converted with its own handler before the + * assembled message is attached. When the handler cannot apply or {@code field} is + * {@code null}, the builder is returned unchanged. + * + * @param builder the dynamic message builder being populated + * @param field the row holding the nested message's field values, or {@code null} to skip + * @return the same {@code builder}, with the nested message set when provided + */ @Override public Builder transformToProtoBuilder(Builder builder, Object field) { if (!canHandle() || field == null) { @@ -71,21 +99,48 @@ public Builder transformToProtoBuilder(Builder builder, Object field) { return builder.setField(fieldDescriptor, elementBuilder.build()); } + /** + * Converts a post-processor map into a Flink {@code Row} for the nested message. + * + * @param field the nested message values as a map keyed by field name + * @return the populated row representing the nested message + */ @Override public Object transformFromPostProcessor(Object field) { return RowFactory.createRow((Map) field, fieldDescriptor.getMessageType()); } + /** + * Converts a nested protobuf message read from the parent into a Flink {@code Row}. + * + * @param field the nested {@code DynamicMessage} read from the parent message + * @return the populated row representing the nested message + */ @Override public Object transformFromProto(Object field) { return RowFactory.createRow((DynamicMessage) field); } + /** + * Converts the nested protobuf message into a row using the descriptor cache. 
+ * + * @param field the nested {@code DynamicMessage} read from the parent message + * @param cache the field descriptor cache used to resolve nested field indices + * @return the populated row representing the nested message + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return RowFactory.createRow((DynamicMessage) field, cache); } + /** + * Reads the nested message field from a Parquet {@code SimpleGroup} into a row. + * + *
<p>
When the field is missing, a row built from the message's default instance is returned. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the populated row, or a default row when the field is absent + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -96,6 +151,14 @@ public Object transformFromParquet(SimpleGroup simpleGroup) { return RowFactory.createRow(defaultMessageInstance); } + /** + * Serializes the nested message row to its JSON string representation. + * + *
<p>
The JSON serialization schema is created lazily on first use. + * + * @param field the nested message row to serialize + * @return the JSON string for the row + */ @Override public Object transformToJson(Object field) { if (jsonRowSerializationSchema == null) { @@ -104,11 +167,21 @@ public Object transformToJson(Object field) { return new String(jsonRowSerializationSchema.serialize((Row) field)); } + /** + * Returns the Flink {@code TypeInformation} used to represent this nested message. + * + * @return the row type derived from the message descriptor + */ @Override public TypeInformation getTypeInformation() { return TypeInformationFactory.getRowType(fieldDescriptor.getMessageType()); } + /** + * Builds the JSON row serialization schema for the nested message type. + * + * @return a schema configured with this message's row type information + */ private JsonRowSerializationSchema createJsonRowSchema() { return JsonRowSerializationSchema .builder() diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/StructMessageHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/StructMessageHandler.java index bb3833acb..a0ded77cc 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/StructMessageHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/StructMessageHandler.java @@ -13,6 +13,9 @@ * The type Struct message proto handler. */ public class StructMessageHandler implements TypeHandler { + /** + * The protobuf {@code FieldDescriptor} of the {@code google.protobuf.Struct} field handled here. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -24,42 +27,92 @@ public StructMessageHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. 
+ * + * @return {@code true} if the field is a non-repeated {@code google.protobuf.Struct} message + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.MESSAGE && fieldDescriptor.toProto().getTypeName().equals(".google.protobuf.Struct") && !fieldDescriptor.isRepeated(); } + /** + * Returns the builder unchanged. + * + *
<p>
This handler does not currently serialize {@code Struct} values back into protobuf. + * + * @param builder the dynamic message builder being populated + * @param field the value that would be set, ignored here + * @return the supplied {@code builder}, unchanged + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { return builder; } + /** + * Returns {@code null}, as {@code Struct} values are not produced from post-processor input. + * + * @param field the value supplied by an upstream post processor + * @return {@code null}, always + */ @Override public Object transformFromPostProcessor(Object field) { return null; } + /** + * Returns {@code null}, as {@code Struct} values are not read from protobuf by this handler. + * + * @param field the value read from the protobuf message + * @return {@code null}, always + */ @Override public Object transformFromProto(Object field) { return null; } + /** + * Returns {@code null}, ignoring the descriptor cache. + * + * @param field the value read from the protobuf message + * @param cache the field descriptor cache, unused here + * @return {@code null}, always + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return null; } + /** + * Returns {@code null}, as {@code Struct} values are not read from Parquet by this handler. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return {@code null}, always + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { return null; } + /** + * Returns {@code null}, as {@code Struct} values are not emitted to JSON by this handler. + * + * @param field the value that would be serialized + * @return {@code null}, always + */ @Override public Object transformToJson(Object field) { return null; } + /** + * Returns the Flink {@code TypeInformation} used to represent this field. 
+ * + * @return an empty named-row type + */ @Override public TypeInformation getTypeInformation() { return Types.ROW_NAMED(new String[]{}); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/TimestampHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/TimestampHandler.java index 32f659452..e39ccbed0 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/TimestampHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/complex/TimestampHandler.java @@ -27,11 +27,29 @@ * The type Timestamp proto handler. */ public class TimestampHandler implements TypeHandler { + /** + * The number of milliseconds in one second, used to split epoch values into seconds. + */ private static final int SECOND_TO_MS_FACTOR = 1000; + /** + * The default seconds component used when a timestamp value is absent. + */ private static final long DEFAULT_SECONDS_VALUE = 0L; + /** + * The default nanoseconds component used when a timestamp value is absent. + */ private static final int DEFAULT_NANOS_VALUE = 0; + /** + * The number of nanoseconds in one millisecond, used when converting Parquet millis. + */ private static final int MS_TO_NANOS_FACTOR = 1000_000; + /** + * The UTC date format used to render timestamps as strings for JSON output. + */ private static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + /** + * The protobuf {@code FieldDescriptor} of the {@code google.protobuf.Timestamp} field handled here. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -44,11 +62,29 @@ public TimestampHandler(Descriptors.FieldDescriptor fieldDescriptor) { dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); } + /** + * Determines whether this handler applies to the field. 
+ * + * @return {@code true} if the field is a {@code google.protobuf.Timestamp} message + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.MESSAGE && fieldDescriptor.getMessageType().getFullName().equals("google.protobuf.Timestamp"); } + /** + * Converts a variety of time representations into a protobuf {@code Timestamp} and sets it. + * + *
<p>
Supported inputs include {@code java.sql.Timestamp}, {@code Instant}, + * {@code LocalDateTime}, a two-field {@code Row} of {@code (seconds, nanos)}, an ISO-8601 + * {@code String}, and any {@code Number} of epoch seconds. When the handler cannot apply or + * {@code field} is {@code null}, the builder is returned unchanged. + * + * @param builder the dynamic message builder being populated + * @param field the time value to convert and set, or {@code null} to skip + * @return the same {@code builder}, with the timestamp set when a value could be derived + * @throws IllegalArgumentException if a {@code Row} input does not have exactly two fields + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { if (!canHandle() || field == null) { @@ -93,27 +129,62 @@ public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder bui return builder; } + /** + * Converts a {@code LocalDateTime} into a protobuf {@code Timestamp} at UTC. + * + * @param timeField the local date-time to convert, interpreted as UTC + * @return the equivalent protobuf timestamp + */ private Timestamp convertLocalDateTime(LocalDateTime timeField) { return Timestamp.newBuilder() .setSeconds(timeField.toEpochSecond(ZoneOffset.UTC)) .build(); } + /** + * Converts a post-processor value into its ISO-8601 string form when it is a valid instant. + * + * @param field the value emitted by an upstream post processor + * @return the value's string representation, or {@code null} if it is not a valid timestamp + */ @Override public Object transformFromPostProcessor(Object field) { return isValid(field) ? field.toString() : null; } + /** + * Converts a protobuf {@code Timestamp} message read from the parent into a Flink {@code Row}. 
+ * + * @param field the nested {@code DynamicMessage} timestamp read from the parent message + * @return a row holding the timestamp's {@code seconds} and {@code nanos} fields + */ @Override public Object transformFromProto(Object field) { return RowFactory.createRow((DynamicMessage) field); } + /** + * Converts the protobuf timestamp into a row using the descriptor cache. + * + * @param field the nested {@code DynamicMessage} timestamp read from the parent message + * @param cache the field descriptor cache used to resolve nested field indices + * @return a row holding the timestamp's {@code seconds} and {@code nanos} fields + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return RowFactory.createRow((DynamicMessage) field, cache); } + /** + * Reads the timestamp field from a Parquet {@code SimpleGroup} into a {@code (seconds, nanos)} row. + * + *
<p>
Both the {@code INT64} millisecond encoding and the nested group encoding (with + * {@code seconds} and {@code nanos} fields) are supported; a default zero timestamp is + * returned when the field is absent. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return a two-field row of seconds and nanos + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -128,6 +199,13 @@ public Object transformFromParquet(SimpleGroup simpleGroup) { return Row.of(DEFAULT_SECONDS_VALUE, DEFAULT_NANOS_VALUE); } + /** + * Parses an {@code INT64} millisecond timestamp from a Parquet group into seconds and nanos. + * + * @param simpleGroup the Parquet group containing the timestamp field + * @param timestampFieldName the name of the timestamp field to read + * @return a two-field row of seconds and nanos + */ private Row parseInt64TimestampFromSimpleGroup(SimpleGroup simpleGroup, String timestampFieldName) { /* conversion from ms to nanos borrowed from Instant.java class and inlined here for performance reasons */ long timeInMillis = simpleGroup.getLong(timestampFieldName, 0); @@ -137,6 +215,13 @@ private Row parseInt64TimestampFromSimpleGroup(SimpleGroup simpleGroup, String t return Row.of(seconds, nanos); } + /** + * Parses a nested-group timestamp (with {@code seconds} and {@code nanos}) from a Parquet group. 
+ * + * @param simpleGroup the Parquet group containing the timestamp field + * @param timestampFieldName the name of the timestamp group field to read + * @return a two-field row of seconds and nanos, defaulting to zero for missing components + */ private Row parseGroupTypeTimestampFromSimpleGroup(SimpleGroup simpleGroup, String timestampFieldName) { SimpleGroup timestampGroup = (SimpleGroup) simpleGroup.getGroup(timestampFieldName, 0); long seconds = 0L; @@ -150,6 +235,13 @@ private Row parseGroupTypeTimestampFromSimpleGroup(SimpleGroup simpleGroup, Stri return Row.of(seconds, nanos); } + /** + * Renders the timestamp row as a UTC date-time string for JSON output. + * + * @param field the timestamp {@code Row} of {@code (seconds, nanos)} + * @return the formatted UTC date-time string, or the original value when it is not a + * two-field row + */ @Override public Object transformToJson(Object field) { Row timeField = (Row) field; @@ -161,11 +253,22 @@ public Object transformToJson(Object field) { } } + /** + * Returns the Flink {@code TypeInformation} used to represent this timestamp field. + * + * @return the row type derived from the timestamp message descriptor + */ @Override public TypeInformation getTypeInformation() { return TypeInformationFactory.getRowType(fieldDescriptor.getMessageType()); } + /** + * Converts a {@code java.sql.Timestamp} into a protobuf {@code Timestamp}. + * + * @param field the SQL timestamp to convert + * @return the equivalent protobuf timestamp, preserving seconds and nanoseconds + */ private Timestamp convertSqlTimestamp(java.sql.Timestamp field) { long timestampSeconds = field.getTime() / SECOND_TO_MS_FACTOR; int timestampNanos = field.getNanos(); @@ -175,6 +278,12 @@ private Timestamp convertSqlTimestamp(java.sql.Timestamp field) { .build(); } + /** + * Checks whether the given value can be parsed as an ISO-8601 instant. 
+ * + * @param field the value to validate + * @return {@code true} if the value is non-null and parses as an {@code Instant} + */ private boolean isValid(Object field) { if (field == null) { return false; diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/BooleanHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/BooleanHandler.java index 46917901f..8c9c9df97 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/BooleanHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/BooleanHandler.java @@ -14,6 +14,9 @@ * The type Boolean primitive type handler. */ public class BooleanHandler implements PrimitiveHandler { + /** + * The protobuf {@code FieldDescriptor} of the boolean field this handler processes. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -25,16 +28,33 @@ public BooleanHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field's Java type is {@code BOOLEAN} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == JavaType.BOOLEAN; } + /** + * Parses the given value into a Java {@code boolean}. + * + * @param field the value to parse, defaulting to {@code false} when {@code null} + * @return the parsed boolean value + */ @Override public Object parseObject(Object field) { return Boolean.parseBoolean(getValueOrDefault(field, "false")); } + /** + * Reads the boolean value for this field from a Parquet {@code SimpleGroup}. 
+ * + * @param simpleGroup the Parquet group holding the encoded record + * @return the boolean value, or {@code false} when the field is absent + */ @Override public Object parseSimpleGroup(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -48,6 +68,12 @@ public Object parseSimpleGroup(SimpleGroup simpleGroup) { } } + /** + * Converts a list of boolean values into a primitive {@code boolean[]}. + * + * @param field the list of boolean values, or {@code null} + * @return the values as a {@code boolean[]}, empty when {@code field} is {@code null} + */ @Override public Object parseRepeatedObjectField(Object field) { boolean[] inputValues = new boolean[0]; @@ -57,6 +83,12 @@ public Object parseRepeatedObjectField(Object field) { return inputValues; } + /** + * Reads the repeated boolean field from a Parquet {@code SimpleGroup} into a {@code boolean[]}. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the boolean array, empty when the field is absent + */ @Override public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -71,11 +103,21 @@ public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { return new boolean[0]; } + /** + * Returns the Flink {@code TypeInformation} for a single boolean value. + * + * @return {@code Types.BOOLEAN} + */ @Override public TypeInformation getTypeInformation() { return Types.BOOLEAN; } + /** + * Returns the Flink {@code TypeInformation} for a repeated boolean field. 
+ * + * @return a primitive boolean-array type + */ @Override public TypeInformation getArrayType() { return Types.PRIMITIVE_ARRAY(Types.BOOLEAN); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/ByteStringHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/ByteStringHandler.java index 146ace65f..18ac1b4dc 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/ByteStringHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/ByteStringHandler.java @@ -16,6 +16,9 @@ * The type Byte string primitive type handler. */ public class ByteStringHandler implements PrimitiveHandler { + /** + * The protobuf {@code FieldDescriptor} of the byte-string field this handler processes. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -27,16 +30,33 @@ public ByteStringHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field's Java type is {@code BYTE_STRING} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == JavaType.BYTE_STRING; } + /** + * Returns the given byte-string value unchanged. + * + * @param field the value to pass through + * @return the same {@code field} value + */ @Override public Object parseObject(Object field) { return field; } + /** + * Reads the byte-string value for this field from a Parquet {@code SimpleGroup}. 
+ * + * @param simpleGroup the Parquet group holding the encoded record + * @return the value as a {@code ByteString}, or {@code null} when the field is absent + */ @Override public Object parseSimpleGroup(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -50,6 +70,12 @@ public Object parseSimpleGroup(SimpleGroup simpleGroup) { } } + /** + * Converts a list of byte-string values into a {@code ByteString[]}. + * + * @param field the list of {@code ByteString} values, or {@code null} + * @return the values as a {@code ByteString[]}, empty when {@code field} is {@code null} + */ @Override public Object parseRepeatedObjectField(Object field) { List inputValues = new ArrayList<>(); @@ -59,6 +85,12 @@ public Object parseRepeatedObjectField(Object field) { return inputValues.toArray(new ByteString[]{}); } + /** + * Reads the repeated byte-string field from a Parquet {@code SimpleGroup} into a {@code ByteString[]}. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the byte-string array, empty when the field is absent + */ @Override public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -73,11 +105,21 @@ public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { return byteStringList.toArray(new ByteString[]{}); } + /** + * Returns the Flink {@code TypeInformation} for a single byte-string value. + * + * @return the type information for {@code ByteString} + */ @Override public TypeInformation getTypeInformation() { return TypeInformation.of(ByteString.class); } + /** + * Returns the Flink {@code TypeInformation} for a repeated byte-string field. 
+ * + * @return an object-array type of {@code ByteString} + */ @Override public TypeInformation getArrayType() { return Types.OBJECT_ARRAY(TypeInformation.of(ByteString.class)); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/DoubleHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/DoubleHandler.java index e326c677d..fba790802 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/DoubleHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/DoubleHandler.java @@ -14,6 +14,9 @@ * The type Double primitive type handler. */ public class DoubleHandler implements PrimitiveHandler { + /** + * The protobuf {@code FieldDescriptor} of the double field this handler processes. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -25,16 +28,33 @@ public DoubleHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field's Java type is {@code DOUBLE} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == JavaType.DOUBLE; } + /** + * Parses the given value into a Java {@code double}. + * + * @param field the value to parse, defaulting to {@code 0} when {@code null} + * @return the parsed double value + */ @Override public Object parseObject(Object field) { return Double.parseDouble(getValueOrDefault(field, "0")); } + /** + * Reads the double value for this field from a Parquet {@code SimpleGroup}. 
+ * + * @param simpleGroup the Parquet group holding the encoded record + * @return the double value, or {@code 0.0} when the field is absent + */ @Override public Object parseSimpleGroup(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -48,6 +68,12 @@ public Object parseSimpleGroup(SimpleGroup simpleGroup) { } } + /** + * Converts a list of double values into a primitive {@code double[]}. + * + * @param field the list of double values, or {@code null} + * @return the values as a {@code double[]}, empty when {@code field} is {@code null} + */ @Override public Object parseRepeatedObjectField(Object field) { double[] inputValues = new double[0]; @@ -57,6 +83,12 @@ public Object parseRepeatedObjectField(Object field) { return inputValues; } + /** + * Reads the repeated double field from a Parquet {@code SimpleGroup} into a {@code double[]}. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the double array, empty when the field is absent + */ @Override public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -71,11 +103,21 @@ public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { return new double[0]; } + /** + * Returns the Flink {@code TypeInformation} for a single double value. + * + * @return {@code Types.DOUBLE} + */ @Override public TypeInformation getTypeInformation() { return Types.DOUBLE; } + /** + * Returns the Flink {@code TypeInformation} for a repeated double field. 
+ * + * @return a primitive double-array type + */ @Override public TypeInformation getArrayType() { return Types.PRIMITIVE_ARRAY(Types.DOUBLE); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/FloatHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/FloatHandler.java index 6083800d1..1f6b32f9f 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/FloatHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/FloatHandler.java @@ -14,6 +14,9 @@ * The type Float primitive type handler. */ public class FloatHandler implements PrimitiveHandler { + /** + * The protobuf {@code FieldDescriptor} of the float field this handler processes. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -25,16 +28,33 @@ public FloatHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field's Java type is {@code FLOAT} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == JavaType.FLOAT; } + /** + * Parses the given value into a Java {@code float}. + * + * @param field the value to parse, defaulting to {@code 0} when {@code null} + * @return the parsed float value + */ @Override public Object parseObject(Object field) { return Float.parseFloat(getValueOrDefault(field, "0")); } + /** + * Reads the float value for this field from a Parquet {@code SimpleGroup}. 
+ * + * @param simpleGroup the Parquet group holding the encoded record + * @return the float value, or {@code 0.0F} when the field is absent + */ @Override public Object parseSimpleGroup(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -48,6 +68,12 @@ public Object parseSimpleGroup(SimpleGroup simpleGroup) { } } + /** + * Converts a list of float values into a primitive {@code float[]}. + * + * @param field the list of float values, or {@code null} + * @return the values as a {@code float[]}, empty when {@code field} is {@code null} + */ @Override public Object parseRepeatedObjectField(Object field) { @@ -58,6 +84,12 @@ public Object parseRepeatedObjectField(Object field) { return inputValues; } + /** + * Reads the repeated float field from a Parquet {@code SimpleGroup} into a {@code float[]}. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the float array, empty when the field is absent + */ @Override public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -72,11 +104,21 @@ public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { return new float[0]; } + /** + * Returns the Flink {@code TypeInformation} for a single float value. + * + * @return {@code Types.FLOAT} + */ @Override public TypeInformation getTypeInformation() { return Types.FLOAT; } + /** + * Returns the Flink {@code TypeInformation} for a repeated float field. 
+ * + * @return a primitive float-array type + */ @Override public TypeInformation getArrayType() { return Types.PRIMITIVE_ARRAY(Types.FLOAT); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/IntegerHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/IntegerHandler.java index c181776dc..8923ed4e0 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/IntegerHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/IntegerHandler.java @@ -14,6 +14,9 @@ * The type Integer primitive type handler. */ public class IntegerHandler implements PrimitiveHandler { + /** + * The protobuf {@code FieldDescriptor} of the integer field this handler processes. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -25,16 +28,33 @@ public IntegerHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field's Java type is {@code INT} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == JavaType.INT; } + /** + * Parses the given value into a Java {@code int}. + * + * @param field the value to parse, defaulting to {@code 0} when {@code null} + * @return the parsed integer value + */ @Override public Object parseObject(Object field) { return Integer.parseInt(getValueOrDefault(field, "0")); } + /** + * Reads the integer value for this field from a Parquet {@code SimpleGroup}. 
+ * + * @param simpleGroup the Parquet group holding the encoded record + * @return the integer value, or {@code 0} when the field is absent + */ @Override public Object parseSimpleGroup(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -48,6 +68,12 @@ public Object parseSimpleGroup(SimpleGroup simpleGroup) { } } + /** + * Converts a list of integer values into a primitive {@code int[]}. + * + * @param field the list of integer values, or {@code null} + * @return the values as an {@code int[]}, empty when {@code field} is {@code null} + */ @Override public Object parseRepeatedObjectField(Object field) { int[] inputValues = new int[0]; @@ -57,6 +83,12 @@ public Object parseRepeatedObjectField(Object field) { return inputValues; } + /** + * Reads the repeated integer field from a Parquet {@code SimpleGroup} into an {@code int[]}. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the integer array, empty when the field is absent + */ @Override public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -71,11 +103,21 @@ public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { return new int[0]; } + /** + * Returns the Flink {@code TypeInformation} for a single integer value. + * + * @return {@code Types.INT} + */ @Override public TypeInformation getTypeInformation() { return Types.INT; } + /** + * Returns the Flink {@code TypeInformation} for a repeated integer field. 
+ * + * @return a primitive int-array type + */ @Override public TypeInformation getArrayType() { return Types.PRIMITIVE_ARRAY(Types.INT); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/LongHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/LongHandler.java index 9292f7cf0..c35271ee3 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/LongHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/LongHandler.java @@ -14,6 +14,9 @@ * The type Long primitive type handler. */ public class LongHandler implements PrimitiveHandler { + /** + * The protobuf {@code FieldDescriptor} of the long field this handler processes. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -25,16 +28,33 @@ public LongHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field's Java type is {@code LONG} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == JavaType.LONG; } + /** + * Parses the given value into a Java {@code long}. + * + * @param field the value to parse, defaulting to {@code 0} when {@code null} + * @return the parsed long value + */ @Override public Object parseObject(Object field) { return Long.parseLong(getValueOrDefault(field, "0")); } + /** + * Reads the long value for this field from a Parquet {@code SimpleGroup}. 
+ * + * @param simpleGroup the Parquet group holding the encoded record + * @return the long value, or {@code 0L} when the field is absent + */ @Override public Object parseSimpleGroup(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -48,6 +68,12 @@ public Object parseSimpleGroup(SimpleGroup simpleGroup) { } } + /** + * Converts a list of long values into a {@code Long[]}. + * + * @param field the list of long values, or {@code null} + * @return the values as a {@code Long[]}, empty when {@code field} is {@code null} + */ @Override public Object parseRepeatedObjectField(Object field) { List inputValues = new ArrayList<>(); @@ -57,6 +83,12 @@ public Object parseRepeatedObjectField(Object field) { return inputValues.toArray(new Long[]{}); } + /** + * Reads the repeated long field from a Parquet {@code SimpleGroup} into a {@code Long[]}. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the long array, empty when the field is absent + */ @Override public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -70,11 +102,21 @@ public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { return longArrayList.toArray(new Long[]{}); } + /** + * Returns the Flink {@code TypeInformation} for a single long value. + * + * @return {@code Types.LONG} + */ @Override public TypeInformation getTypeInformation() { return Types.LONG; } + /** + * Returns the Flink {@code TypeInformation} for a repeated long field. 
+ * + * @return an object-array type of {@code Long} + */ @Override public TypeInformation getArrayType() { return Types.OBJECT_ARRAY(Types.LONG); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/PrimitiveHandlerFactory.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/PrimitiveHandlerFactory.java index bd4d1fb40..41741b6ab 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/PrimitiveHandlerFactory.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/PrimitiveHandlerFactory.java @@ -26,6 +26,15 @@ public static PrimitiveHandler getTypeHandler(Descriptors.FieldDescriptor fieldD return filteredTypeHandlers.orElseThrow(() -> new DataTypeNotSupportedException("Data type " + fieldDescriptor.getJavaType() + " not supported in primitive type handlers")); } + /** + * Builds the ordered list of candidate primitive handlers for the given field. + * + *

{@code getTypeHandler} selects the first handler whose {@code canHandle()} returns + * {@code true}, so the order determines which handler claims each Java type. + * + * @param fieldDescriptor the field descriptor to build candidate handlers for + * @return the ordered list of candidate primitive handlers to try + */ private static List getSpecificHandlers(Descriptors.FieldDescriptor fieldDescriptor) { return Arrays.asList( new IntegerHandler(fieldDescriptor), diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/StringHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/StringHandler.java index 71258dbf6..c2172a022 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/StringHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/primitive/StringHandler.java @@ -15,6 +15,9 @@ * The type String primitive type handler. */ public class StringHandler implements PrimitiveHandler { + /** + * The protobuf {@code FieldDescriptor} of the string field this handler processes. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -26,16 +29,33 @@ public StringHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field's Java type is {@code STRING} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == JavaType.STRING; } + /** + * Converts the given value into its string form. + * + * @param field the value to convert, defaulting to an empty string when {@code null} + * @return the value's string representation + */ @Override public Object parseObject(Object field) { return getValueOrDefault(field, ""); } + /** + * Reads the string value for this field from a Parquet {@code SimpleGroup}. 
+ * + * @param simpleGroup the Parquet group holding the encoded record + * @return the string value, or an empty string when the field is absent + */ @Override public Object parseSimpleGroup(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -49,6 +69,12 @@ public Object parseSimpleGroup(SimpleGroup simpleGroup) { } } + /** + * Converts a list of string values into a {@code String[]}. + * + * @param field the list of string values, or {@code null} + * @return the values as a {@code String[]}, empty when {@code field} is {@code null} + */ @Override public Object parseRepeatedObjectField(Object field) { List inputValues = new ArrayList<>(); @@ -58,6 +84,12 @@ public Object parseRepeatedObjectField(Object field) { return inputValues.toArray(new String[]{}); } + /** + * Reads the repeated string field from a Parquet {@code SimpleGroup} into a {@code String[]}. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the string array, empty when the field is absent + */ @Override public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -72,11 +104,21 @@ public Object parseRepeatedSimpleGroupField(SimpleGroup simpleGroup) { return new String[0]; } + /** + * Returns the Flink {@code TypeInformation} for a single string value. + * + * @return {@code Types.STRING} + */ @Override public TypeInformation getTypeInformation() { return Types.STRING; } + /** + * Returns the Flink {@code TypeInformation} for a repeated string field. 
+ * + * @return an object-array type of {@code String} + */ @Override public TypeInformation getArrayType() { return ObjectArrayTypeInfo.getInfoFor(Types.STRING); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedEnumHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedEnumHandler.java index a916d9b85..1ce7cfd0a 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedEnumHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedEnumHandler.java @@ -23,7 +23,13 @@ * The type Repeated enum proto handler. */ public class RepeatedEnumHandler implements TypeHandler { + /** + * The protobuf {@code FieldDescriptor} of the repeated enum field this handler processes. + */ private Descriptors.FieldDescriptor fieldDescriptor; + /** + * Shared Gson instance used to serialize the enum-name array to JSON. + */ private static final Gson GSON = new Gson(); /** @@ -35,11 +41,27 @@ public RepeatedEnumHandler(Descriptors.FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field is a repeated protobuf {@code enum} + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == ENUM && fieldDescriptor.isRepeated(); } + /** + * Sets the repeated enum field on the builder by resolving each value's name to a constant. + * + *

The input may be an array or a {@code List} of enum constant names. When the handler + * cannot apply or {@code field} is {@code null}, the builder is returned unchanged. + * + * @param builder the dynamic message builder being populated + * @param field the collection of enum constant names to set, or {@code null} to skip + * @return the same {@code builder}, with the repeated enum field set when provided + * @throws EnumFieldNotFoundException if any name does not match a constant of the enum + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { if (!canHandle() || field == null) { @@ -55,6 +77,13 @@ public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder bui return builder; } + /** + * Resolves a single value to its protobuf enum value descriptor by name. + * + * @param field the enum constant name to resolve + * @return the matching enum value descriptor + * @throws EnumFieldNotFoundException if the name does not match any constant of the enum + */ private Descriptors.EnumValueDescriptor getEnumValue(Object field) { String stringValue = String.valueOf(field).trim(); Descriptors.EnumValueDescriptor valueByName = fieldDescriptor.getEnumType().findValueByName(stringValue); @@ -64,21 +93,48 @@ private Descriptors.EnumValueDescriptor getEnumValue(Object field) { return valueByName; } + /** + * Converts a post-processor value into an array of enum constant names. + * + * @param field the collection of enum values emitted by an upstream post processor + * @return a {@code String[]} of enum constant names + */ @Override public Object transformFromPostProcessor(Object field) { return getValue(field); } + /** + * Converts the repeated enum values read from a protobuf message into an array of names. 
+ * + * @param field the repeated enum value read from the message + * @return a {@code String[]} of enum constant names + */ @Override public Object transformFromProto(Object field) { return getValue(field); } + /** + * Converts the repeated protobuf enum values into an array of names, ignoring the cache. + * + * @param field the repeated enum value read from the message + * @param cache the field descriptor cache, unused for enum fields + * @return a {@code String[]} of enum constant names + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return getValue(field); } + /** + * Reads the repeated enum field from a Parquet {@code SimpleGroup} into an array of names. + * + *

Unknown values fall back to the enum's zero-numbered default constant. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return a {@code String[]} of enum constant names, empty when the field is absent + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { String defaultEnumValue = fieldDescriptor.getEnumType().findValueByNumber(0).getName(); @@ -96,16 +152,33 @@ public Object transformFromParquet(SimpleGroup simpleGroup) { return enumArrayList.toArray(new String[]{}); } + /** + * Serializes the repeated enum values to a JSON array of constant names. + * + * @param field the repeated enum value to serialize + * @return the JSON string for the array of enum names + */ @Override public Object transformToJson(Object field) { return GSON.toJson(getValue(field)); } + /** + * Returns the Flink {@code TypeInformation} used to represent this repeated enum field. + * + * @return an object-array type of {@code String} + */ @Override public TypeInformation getTypeInformation() { return ObjectArrayTypeInfo.getInfoFor(Types.STRING); } + /** + * Converts a collection of enum values into an array of their string names. + * + * @param field the collection of enum values, or {@code null} + * @return a {@code String[]} of names, empty when {@code field} is {@code null} + */ private Object getValue(Object field) { List values = new ArrayList<>(); if (field != null) { @@ -114,6 +187,12 @@ private Object getValue(Object field) { return values.toArray(new String[]{}); } + /** + * Maps each element of the given list to its string representation. 
+ * + * @param protos the list of enum values to stringify + * @return the list of string names + */ private List getStringRow(List protos) { return protos .stream() diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedMessageHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedMessageHandler.java index 6adbb712a..05979cf99 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedMessageHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedMessageHandler.java @@ -29,8 +29,17 @@ * The type Repeated message proto handler. */ public class RepeatedMessageHandler implements TypeHandler { + /** + * Lazily created schema used to serialize each message row to JSON. + */ private JsonRowSerializationSchema jsonRowSerializationSchema; + /** + * The protobuf {@code FieldDescriptor} of the repeated message field being converted. + */ private FieldDescriptor fieldDescriptor; + /** + * The descriptor of the repeated message's element type, cached for deserialization. + */ private Descriptors.Descriptor fieldMessageDescriptor; /** @@ -45,11 +54,27 @@ public RepeatedMessageHandler(FieldDescriptor fieldDescriptor) { } } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field is a repeated message type + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == MESSAGE && fieldDescriptor.isRepeated(); } + /** + * Builds the repeated protobuf messages from Flink rows and sets them on the builder. + * + *

The input may be an {@code ArrayList} or an array of {@code Row}s; each row is converted + * into a nested {@code DynamicMessage}. When the handler cannot apply or {@code field} is + * {@code null}, the builder is returned unchanged. + * + * @param builder the dynamic message builder being populated + * @param field the rows representing the repeated messages, or {@code null} to skip + * @return the same {@code builder}, with the repeated messages set when provided + */ @Override public Builder transformToProtoBuilder(Builder builder, Object field) { if (!canHandle() || field == null) { @@ -73,6 +98,12 @@ public Builder transformToProtoBuilder(Builder builder, Object field) { return builder.setField(fieldDescriptor, messages); } + /** + * Converts a post-processor JSON array into an array of message {@code Row}s. + * + * @param field the {@code JSONArray} of nested message values, or {@code null} + * @return an array of rows, one per nested message + */ @Override public Object transformFromPostProcessor(Object field) { ArrayList rows = new ArrayList<>(); @@ -85,6 +116,12 @@ public Object transformFromPostProcessor(Object field) { return rows.toArray(); } + /** + * Converts the repeated messages read from a protobuf message into an array of rows. + * + * @param field the list of nested {@code DynamicMessage}s read from the parent, or {@code null} + * @return an array of rows, one per nested message + */ @Override public Object transformFromProto(Object field) { ArrayList rows = new ArrayList<>(); @@ -95,6 +132,13 @@ public Object transformFromProto(Object field) { return rows.toArray(); } + /** + * Converts the repeated protobuf messages into rows using the descriptor cache. 
+ * + * @param field the list of nested {@code DynamicMessage}s read from the parent, or {@code null} + * @param cache the field descriptor cache used to resolve nested field indices + * @return an array of rows, one per nested message + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { ArrayList rows = new ArrayList<>(); @@ -105,6 +149,12 @@ public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache ca return rows.toArray(); } + /** + * Reads the repeated message field from a Parquet {@code SimpleGroup} into an array of rows. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return an array of rows, one per nested message, empty when the field is absent + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { String fieldName = fieldDescriptor.getName(); @@ -119,6 +169,14 @@ public Object transformFromParquet(SimpleGroup simpleGroup) { return rowList.toArray(new Row[]{}); } + /** + * Serializes the repeated message rows into a string of JSON objects. + * + *

The JSON serialization schema is created lazily on first use. + * + * @param field the array of message rows to serialize + * @return a string representation of the serialized JSON objects + */ @Override public Object transformToJson(Object field) { if (jsonRowSerializationSchema == null) { @@ -129,17 +187,36 @@ public Object transformToJson(Object field) { .toArray(String[]::new)); } + /** + * Returns the Flink {@code TypeInformation} used to represent this repeated message field. + * + * @return an object-array type whose element is the nested message's row type + */ @Override public TypeInformation getTypeInformation() { return Types.OBJECT_ARRAY(TypeInformationFactory.getRowType(fieldDescriptor.getMessageType())); } + /** + * Builds a single nested {@code DynamicMessage} from a Flink {@code Row}. + * + * @param nestedFieldDescriptors the descriptors of the nested message's fields + * @param row the row holding the nested message's field values + * @return the assembled nested message + */ private DynamicMessage getNestedDynamicMessage(List nestedFieldDescriptors, Row row) { Builder elementBuilder = DynamicMessage.newBuilder(fieldDescriptor.getMessageType()); handleNestedField(elementBuilder, nestedFieldDescriptors, row); return elementBuilder.build(); } + /** + * Populates the element builder from a row by converting each present nested field. + * + * @param elementBuilder the builder for the nested message being assembled + * @param nestedFieldDescriptors the descriptors of the nested message's fields + * @param row the row holding the nested message's field values + */ private void handleNestedField(Builder elementBuilder, List nestedFieldDescriptors, Row row) { for (FieldDescriptor nestedFieldDescriptor : nestedFieldDescriptors) { int index = nestedFieldDescriptor.getIndex(); @@ -151,6 +228,11 @@ private void handleNestedField(Builder elementBuilder, List nes } } + /** + * Builds the JSON row serialization schema for the nested message type. 
+ * + * @return a schema configured with the nested message's row type information + */ private JsonRowSerializationSchema createJsonRowSchema() { return JsonRowSerializationSchema .builder() diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedPrimitiveHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedPrimitiveHandler.java index 0c194bae9..b4854186c 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedPrimitiveHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedPrimitiveHandler.java @@ -23,7 +23,13 @@ * The type Repeated primitive proto handler. */ public class RepeatedPrimitiveHandler implements TypeHandler { + /** + * The protobuf {@code FieldDescriptor} of the repeated primitive field being converted. + */ private final FieldDescriptor fieldDescriptor; + /** + * Shared Gson instance used to serialize the primitive array to JSON. + */ private static final Gson GSON = new Gson(); /** @@ -35,11 +41,26 @@ public RepeatedPrimitiveHandler(FieldDescriptor fieldDescriptor) { this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field is repeated and neither a message nor an enum + */ @Override public boolean canHandle() { return fieldDescriptor.isRepeated() && fieldDescriptor.getJavaType() != MESSAGE && fieldDescriptor.getJavaType() != ENUM; } + /** + * Sets the repeated primitive field on the builder from a list or array of values. + * + *

An array input is first wrapped in a {@code List}. When the handler cannot apply or + * {@code field} is {@code null}, the builder is returned unchanged. + * + * @param builder the dynamic message builder being populated + * @param field the collection of primitive values to set, or {@code null} to skip + * @return the same {@code builder}, with the repeated field set when provided + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { if (!canHandle() || field == null) { @@ -51,6 +72,12 @@ public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder bui return builder.setField(fieldDescriptor, field); } + /** + * Converts a post-processor list of values into a list of parsed primitives. + * + * @param field the list of raw values emitted by an upstream post processor, or {@code null} + * @return a list of parsed primitive values, empty when {@code field} is {@code null} + */ @Override public Object transformFromPostProcessor(Object field) { ArrayList outputValues = new ArrayList<>(); @@ -64,29 +91,59 @@ public Object transformFromPostProcessor(Object field) { return outputValues; } + /** + * Converts the repeated primitive values read from a protobuf message into an array. + * + * @param field the repeated primitive value read from the message + * @return the values in the array type supplied by the matching {@code PrimitiveHandler} + */ @Override public Object transformFromProto(Object field) { PrimitiveHandler primitiveHandler = PrimitiveHandlerFactory.getTypeHandler(fieldDescriptor); return primitiveHandler.parseRepeatedObjectField(field); } + /** + * Converts the repeated protobuf primitives into an array, ignoring the cache. 
+ * + * @param field the repeated primitive value read from the message + * @param cache the field descriptor cache, unused for primitive fields + * @return the values in the array type supplied by the matching {@code PrimitiveHandler} + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { PrimitiveHandler primitiveHandler = PrimitiveHandlerFactory.getTypeHandler(fieldDescriptor); return primitiveHandler.parseRepeatedObjectField(field); } + /** + * Reads the repeated primitive field from a Parquet {@code SimpleGroup} into an array. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return the values in the array type supplied by the matching {@code PrimitiveHandler} + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { PrimitiveHandler primitiveHandler = PrimitiveHandlerFactory.getTypeHandler(fieldDescriptor); return primitiveHandler.parseRepeatedSimpleGroupField(simpleGroup); } + /** + * Serializes the repeated primitive values to a JSON array. + * + * @param field the array to serialize + * @return the JSON string for the array + */ @Override public Object transformToJson(Object field) { return GSON.toJson(field); } + /** + * Returns the Flink {@code TypeInformation} used to represent this repeated primitive field. 
+ * + * @return the array type supplied by the matching {@code PrimitiveHandler} + */ @Override public TypeInformation getTypeInformation() { return PrimitiveHandlerFactory.getTypeHandler(fieldDescriptor).getArrayType(); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedStructMessageHandler.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedStructMessageHandler.java index 4c35de1f6..6d07dac11 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedStructMessageHandler.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/serde/typehandler/repeated/RepeatedStructMessageHandler.java @@ -13,6 +13,9 @@ * The type Repeated struct message proto handler. */ public class RepeatedStructMessageHandler implements TypeHandler { + /** + * The protobuf {@code FieldDescriptor} of the repeated {@code google.protobuf.Struct} field handled here. + */ private Descriptors.FieldDescriptor fieldDescriptor; /** @@ -24,42 +27,92 @@ public RepeatedStructMessageHandler(Descriptors.FieldDescriptor fieldDescriptor) this.fieldDescriptor = fieldDescriptor; } + /** + * Determines whether this handler applies to the field. + * + * @return {@code true} if the field is a repeated {@code google.protobuf.Struct} message + */ @Override public boolean canHandle() { return fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.MESSAGE && fieldDescriptor.toProto().getTypeName().equals(".google.protobuf.Struct") && fieldDescriptor.isRepeated(); } + /** + * Returns the builder unchanged. + * + *

This handler does not currently serialize repeated {@code Struct} values into protobuf. + * + * @param builder the dynamic message builder being populated + * @param field the value that would be set, ignored here + * @return the supplied {@code builder}, unchanged + */ @Override public DynamicMessage.Builder transformToProtoBuilder(DynamicMessage.Builder builder, Object field) { return builder; } + /** + * Returns {@code null}, as repeated {@code Struct} values are not produced from post-processor input. + * + * @param field the value supplied by an upstream post processor + * @return {@code null}, always + */ @Override public Object transformFromPostProcessor(Object field) { return null; } + /** + * Returns {@code null}, as repeated {@code Struct} values are not read from protobuf by this handler. + * + * @param field the value read from the protobuf message + * @return {@code null}, always + */ @Override public Object transformFromProto(Object field) { return null; } + /** + * Returns {@code null}, ignoring the descriptor cache. + * + * @param field the value read from the protobuf message + * @param cache the field descriptor cache, unused here + * @return {@code null}, always + */ @Override public Object transformFromProtoUsingCache(Object field, FieldDescriptorCache cache) { return null; } + /** + * Returns {@code null}, as repeated {@code Struct} values are not read from Parquet by this handler. + * + * @param simpleGroup the Parquet group holding the encoded record + * @return {@code null}, always + */ @Override public Object transformFromParquet(SimpleGroup simpleGroup) { return null; } + /** + * Returns {@code null}, as repeated {@code Struct} values are not emitted to JSON by this handler. + * + * @param field the value that would be serialized + * @return {@code null}, always + */ @Override public Object transformToJson(Object field) { return null; } + /** + * Returns the Flink {@code TypeInformation} used to represent this field. 
+ * + * @return an object-array type of empty named rows + */ @Override public TypeInformation getTypeInformation() { return Types.OBJECT_ARRAY(Types.ROW_NAMED(new String[]{})); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/AggregateUdf.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/AggregateUdf.java index 2b9938027..d93021b82 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/AggregateUdf.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/AggregateUdf.java @@ -17,8 +17,20 @@ */ public abstract class AggregateUdf extends AggregateFunction { + /** Telemetry helper used to register the UDF usage gauge against the Flink metric group. */ private GaugeStatsManager gaugeStatsManager; + /** + * Initializes the aggregate function and attempts to register its UDF telemetry gauge. + * + *

Delegates to the superclass {@code open}, then builds a {@link GaugeStatsManager} from + * the {@link FunctionContext} metric group and registers an integer gauge keyed by + * {@code UDF_TELEMETRY_GROUP_KEY}. As noted on this class, the gauge is not actually published + * for aggregate functions due to the referenced Flink issue. + * + * @param context the Flink function context exposing the runtime metric group + * @throws Exception if the superclass {@code open} call fails + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/ScalarUdf.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/ScalarUdf.java index 9256d482c..74c9489d7 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/ScalarUdf.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/ScalarUdf.java @@ -12,8 +12,19 @@ */ public abstract class ScalarUdf extends ScalarFunction { + /** Telemetry helper used to register the UDF usage gauge against the Flink metric group. */ private GaugeStatsManager gaugeStatsManager; + /** + * Initializes this scalar function and registers its UDF telemetry gauge. + * + *

Delegates to the superclass {@code open}, then builds a {@link GaugeStatsManager} from + * the {@link FunctionContext} metric group and registers an integer gauge keyed by + * {@code UDF_TELEMETRY_GROUP_KEY} so that usage of this UDF is observable in metrics. + * + * @param context the Flink function context exposing the runtime metric group + * @throws Exception if the superclass {@code open} call fails + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/TableUdf.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/TableUdf.java index 43e5770b8..a53a72bb7 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/TableUdf.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/TableUdf.java @@ -14,8 +14,19 @@ */ public abstract class TableUdf extends TableFunction { + /** Telemetry helper used to register the UDF usage gauge against the Flink metric group. */ private GaugeStatsManager gaugeStatsManager; + /** + * Initializes this table function and registers its UDF telemetry gauge. + * + *

Delegates to the superclass {@code open}, then builds a {@link GaugeStatsManager} from + * the {@link FunctionContext} metric group and registers an integer gauge keyed by + * {@code UDF_TELEMETRY_GROUP_KEY} so that usage of this UDF is observable in metrics. + * + * @param context the Flink function context exposing the runtime metric group + * @throws Exception if the superclass {@code open} call fails + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/UdfFactory.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/UdfFactory.java index bed3085bb..0cdc4f09e 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/UdfFactory.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/udfs/UdfFactory.java @@ -9,7 +9,9 @@ * The Udf factory for scalar functions, table functions, and aggregate functions. */ public abstract class UdfFactory { + /** The Flink table environment into which the UDFs are registered as temporary functions. */ private final StreamTableEnvironment streamTableEnvironment; + /** The Dagger configuration made available to concrete factory implementations. */ private final Configuration configuration; /** diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/LastColumnWatermark.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/LastColumnWatermark.java index 8ee9171e4..e913aacf5 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/LastColumnWatermark.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/LastColumnWatermark.java @@ -7,8 +7,27 @@ import java.sql.Timestamp; import java.time.Duration; +/** + * {@link WatermarkStrategyDefinition} that derives event time from the last column of each row. + * + *

It builds a bounded-out-of-orderness strategy whose timestamp assigner reads the final field of + * each {@link Row} (index {@code arity - 1}), expects it to be a {@link Timestamp}, and uses its + * epoch-millis value as the event-time timestamp. This is appropriate when the stream's event-time + * column is known to be appended as the last column of the schema. + */ public class LastColumnWatermark implements WatermarkStrategyDefinition { + /** + * {@inheritDoc} + * + *
<p>
Returns a bounded-out-of-orderness strategy that extracts the event-time timestamp from the + * last field of each {@link Row}, casting it to {@link Timestamp} and reading its epoch-millis + * value. + * + * @param waterMarkDelayInMs the allowed out-of-orderness, in milliseconds, applied to the + * generated watermarks + * @return a watermark strategy keyed off the last column's timestamp + */ @Override public WatermarkStrategy getWatermarkStrategy(long waterMarkDelayInMs) { return WatermarkStrategy. diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/NoWatermark.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/NoWatermark.java index afa6d606e..0c692804f 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/NoWatermark.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/NoWatermark.java @@ -3,7 +3,23 @@ import org.apache.flink.api.common.eventtime.WatermarkStrategy; import org.apache.flink.types.Row; +/** + * {@link WatermarkStrategyDefinition} that disables watermarking entirely. + * + *
<p>
It returns Flink's {@link WatermarkStrategy#noWatermarks()} strategy, so no event-time + * watermarks are generated for the {@link Row} stream. This is appropriate for pipelines that do not + * rely on event-time progress, such as purely processing-time jobs. + */ public class NoWatermark implements WatermarkStrategyDefinition { + /** + * {@inheritDoc} + * + *
<p>
Always returns {@link WatermarkStrategy#noWatermarks()}; the {@code waterMarkDelayInMs} + * argument is ignored because no watermarks are emitted. + * + * @param waterMarkDelayInMs ignored, since this strategy emits no watermarks + * @return a no-op watermark strategy that never advances event time + */ @Override public WatermarkStrategy getWatermarkStrategy(long waterMarkDelayInMs) { return WatermarkStrategy.noWatermarks(); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/RowtimeFieldWatermark.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/RowtimeFieldWatermark.java index a4d5b558a..3e2a18840 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/RowtimeFieldWatermark.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/RowtimeFieldWatermark.java @@ -8,14 +8,41 @@ import java.time.Duration; import java.util.Arrays; +/** + * {@link WatermarkStrategyDefinition} that derives event time from a column named {@code rowtime}. + * + *
<p>
Given the stream's ordered column names, it locates the position of the {@code rowtime} column + * and builds a bounded-out-of-orderness strategy whose timestamp assigner reads that field from each + * {@link Row}, expects a {@link Timestamp}, and uses its epoch-millis value as the event-time + * timestamp. Use this when the event-time column is identified by name rather than by position. + */ public class RowtimeFieldWatermark implements WatermarkStrategyDefinition { + /** Conventional name of the event-time column this strategy looks up. */ private static final String ROWTIME = "rowtime"; + /** Ordered stream column names, used to resolve the index of the {@code rowtime} column. */ private final String[] columnNames; + /** + * Creates a strategy definition that reads event time from the {@code rowtime} column. + * + * @param columnNames the ordered column names of the stream, used to resolve the index of the + * {@code rowtime} column + */ public RowtimeFieldWatermark(String[] columnNames) { this.columnNames = columnNames; } + /** + * {@inheritDoc} + * + *
<p>
Returns a bounded-out-of-orderness strategy that extracts the event-time timestamp from the + * field at the index of the {@code rowtime} column (resolved from the configured column names), + * casting it to {@link Timestamp} and reading its epoch-millis value. + * + * @param waterMarkDelayInMs the allowed out-of-orderness, in milliseconds, applied to the + * generated watermarks + * @return a watermark strategy keyed off the {@code rowtime} column's timestamp + */ @Override public WatermarkStrategy getWatermarkStrategy(long waterMarkDelayInMs) { return WatermarkStrategy. diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/StreamWatermarkAssigner.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/StreamWatermarkAssigner.java index feac54918..adaeb8ee9 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/StreamWatermarkAssigner.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/StreamWatermarkAssigner.java @@ -5,18 +5,59 @@ import java.io.Serializable; +/** + * Applies a {@link WatermarkStrategyDefinition} to a Flink {@link DataStream} of {@link Row} + * records. + * + *
<p>
This is the entry point Dagger uses to attach event-time timestamps and watermarks to a source + * stream. It wraps a configured strategy definition and exposes overloads that either always assign + * watermarks or skip assignment when per-partition watermarking is enabled (in which case + * watermarks are expected to be assigned closer to the source instead). It is {@link Serializable} + * so it can be embedded in the Flink job graph. + */ public class StreamWatermarkAssigner implements Serializable { + /** Strategy used to build the Flink watermark strategy applied to the stream. */ private WatermarkStrategyDefinition watermarkStrategyDefinition; + /** + * Creates an assigner backed by the given watermark strategy definition. + * + * @param watermarkStrategyDefinition the strategy used to build the {@code WatermarkStrategy} + * applied to the stream + */ public StreamWatermarkAssigner(WatermarkStrategyDefinition watermarkStrategyDefinition) { this.watermarkStrategyDefinition = watermarkStrategyDefinition; } + /** + * Assigns timestamps and watermarks to the stream unless per-partition watermarking is enabled. + * + *
<p>
When {@code enablePerPartitionWatermark} is {@code false}, the configured strategy is + * applied to the stream via {@code assignTimestampsAndWatermarks}. When it is {@code true}, the + * input stream is returned unchanged, on the assumption that watermarks are assigned per + * partition closer to the source instead. + * + * @param inputStream the source stream of {@link Row} records + * @param watermarkDelayMs the allowed out-of-orderness, in milliseconds, passed to the + * strategy definition + * @param enablePerPartitionWatermark when {@code true} skip assignment here and return the input + * unchanged; when {@code false} assign watermarks now + * @return the stream with watermarks assigned, or the unchanged input stream when per-partition + * watermarking is enabled + */ public DataStream assignTimeStampAndWatermark(DataStream inputStream, long watermarkDelayMs, boolean enablePerPartitionWatermark) { return !enablePerPartitionWatermark ? inputStream .assignTimestampsAndWatermarks(watermarkStrategyDefinition.getWatermarkStrategy(watermarkDelayMs)) : inputStream; } + /** + * Assigns timestamps and watermarks to the stream using the configured strategy. 
+ * + * @param inputStream the source stream of {@link Row} records + * @param watermarkDelayMs the allowed out-of-orderness, in milliseconds, passed to the strategy + * definition + * @return the stream with event-time timestamps and watermarks assigned + */ public DataStream assignTimeStampAndWatermark(DataStream inputStream, long watermarkDelayMs) { return inputStream .assignTimestampsAndWatermarks(watermarkStrategyDefinition.getWatermarkStrategy(watermarkDelayMs)); diff --git a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/WatermarkStrategyDefinition.java b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/WatermarkStrategyDefinition.java index 1e19c8658..b6ce3d41f 100644 --- a/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/WatermarkStrategyDefinition.java +++ b/dagger-common/src/main/java/com/gotocompany/dagger/common/watermark/WatermarkStrategyDefinition.java @@ -5,6 +5,25 @@ import java.io.Serializable; +/** + * Strategy abstraction for building a Flink {@link WatermarkStrategy} over Dagger's {@link Row} + * stream. + * + *
<p>
Implementations decide how event-time timestamps are extracted from each {@link Row} and how + * watermarks are generated from them — for example from the last column, from a named + * {@code rowtime} column, or not at all. The strategy is parameterized by an allowed + * out-of-orderness delay so that late events within that bound are still treated as on time. It + * extends {@link Serializable} because Flink ships the strategy to its distributed operators. + */ public interface WatermarkStrategyDefinition extends Serializable { + + /** + * Builds the Flink watermark strategy to assign to the {@link Row} stream. + * + * @param waterMarkDelayInMs the allowed out-of-orderness, in milliseconds, used when generating + * bounded watermarks; implementations that emit no watermarks may + * ignore this value + * @return the watermark strategy to assign to the stream + */ WatermarkStrategy getWatermarkStrategy(long waterMarkDelayInMs); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/DaggerSqlJobBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/DaggerSqlJobBuilder.java index b5c65ee90..bcf984717 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/DaggerSqlJobBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/DaggerSqlJobBuilder.java @@ -41,15 +41,45 @@ import static com.gotocompany.dagger.functions.common.Constants.PYTHON_UDF_ENABLE_KEY; import static org.apache.flink.table.api.Expressions.$; +/** + * Default {@link JobBuilder} implementation that assembles and runs a Dagger SQL job on Flink. + * + *
<p>
The builder is driven by {@link KafkaProtoSQLProcessor} and wires together every stage of a + * Dagger pipeline: it configures the Flink execution and table environments, registers source + * streams (with their watermark strategies and pre-processors), registers user-defined functions, + * runs the configured SQL query, applies post-processors and finally attaches the sink. Each + * {@code register*} step returns {@code this} so the registration stages can be chained fluently. + */ public class DaggerSqlJobBuilder implements JobBuilder { + /** + * Dagger configuration that supplies all Flink, source, SQL and processor settings. + */ private final Configuration configuration; + /** + * Flink streaming execution environment configured and used to run the job. + */ private final StreamExecutionEnvironment executionEnvironment; + /** + * Flink table environment used to register sources and functions and to run the SQL query. + */ private final StreamTableEnvironment tableEnvironment; + /** + * Exporter that publishes pipeline metrics gathered from sources, processors and sinks. + */ private final MetricsTelemetryExporter telemetryExporter = new MetricsTelemetryExporter(); + /** + * Orchestrates the Stencil client used to resolve Protobuf descriptors at runtime. + */ private StencilClientOrchestrator stencilClientOrchestrator; + /** + * Reporter that emits StatsD metrics for the running job. + */ private DaggerStatsDReporter daggerStatsDReporter; + /** + * Context holding the configuration together with the Flink execution and table environments. + */ private final DaggerContext daggerContext; /** @@ -119,10 +149,28 @@ public JobBuilder registerSourceWithPreProcessors() { return this; } + /** + * Selects the watermark strategy definition to apply when registering a source stream. 
+ * + * @param enablePerPartitionWatermark whether per-partition watermarks are enabled; when + * {@code true} a {@link LastColumnWatermark} is used, + * otherwise a {@link NoWatermark} is returned + * @return the watermark strategy definition matching the requested behaviour + */ private WatermarkStrategyDefinition getSourceWatermarkDefinition(Boolean enablePerPartitionWatermark) { return enablePerPartitionWatermark ? new LastColumnWatermark() : new NoWatermark(); } + /** + * Builds the Flink Table API expressions used to convert a source data stream into a table. + * + *
<p>
Every column is projected by its name and the final column is bound to the configured + * row-time attribute so that event-time based SQL operations work as expected. + * + * @param streamInfo the stream metadata holding the ordered column names of the source + * @return an array of {@code ApiExpression}, one per column, with the last element marked as the + * row-time attribute; an empty array when the stream has no columns + */ private ApiExpression[] getApiExpressions(StreamInfo streamInfo) { String rowTimeAttributeName = configuration.getString(Constants.FLINK_ROWTIME_ATTRIBUTE_NAME_KEY, Constants.FLINK_ROWTIME_ATTRIBUTE_NAME_DEFAULT); String[] columnNames = streamInfo.getColumnNames(); @@ -167,6 +215,21 @@ public JobBuilder registerFunctions() throws IOException { return this; } + /** + * Reflectively instantiates a {@link UdfFactory} from its fully-qualified class name. + * + *
<p>
The factory class is expected to expose a constructor that accepts a + * {@code StreamTableEnvironment} and a {@link Configuration}, which receive the job's table + * environment and configuration respectively. + * + * @param udfFactoryClassName the fully-qualified name of the UDF factory class to load + * @return the instantiated UDF factory, ready to register its functions + * @throws ClassNotFoundException if the named class cannot be found on the classpath + * @throws NoSuchMethodException if the class has no constructor with the expected signature + * @throws IllegalAccessException if the constructor is not accessible + * @throws InvocationTargetException if the underlying constructor throws an exception + * @throws InstantiationException if the class cannot be instantiated + */ private UdfFactory getUdfFactory(String udfFactoryClassName) throws ClassNotFoundException, NoSuchMethodException, IllegalAccessException, InvocationTargetException, InstantiationException { Class udfFactoryClass = Class.forName(udfFactoryClassName); @@ -212,6 +275,16 @@ protected StreamInfo createStreamInfo(Table table) { return new StreamInfo(stream, table.getSchema().getFieldNames()); } + /** + * Applies the configured post-processors to the given stream in declaration order. + * + *
<p>
Post-processors enrich or transform the output {@code Row} records (for example through + * HTTP, GRPC, Elasticsearch, Postgres, longbow or transformer lookups) before the data is + * handed off to the sink. + * + * @param streamInfo the stream produced by the SQL query + * @return the stream after every post-processor has been applied + */ private StreamInfo addPostProcessor(StreamInfo streamInfo) { List postProcessors = PostProcessorFactory.getPostProcessors(daggerContext, stencilClientOrchestrator, streamInfo.getColumnNames(), telemetryExporter); for (PostProcessor postProcessor : postProcessors) { @@ -220,6 +293,13 @@ private StreamInfo addPostProcessor(StreamInfo streamInfo) { return streamInfo; } + /** + * Applies the configured pre-processors for a source table to the given stream in order. + * + * @param streamInfo the stream registered from the source + * @param tableName the name of the source table whose pre-processors should be applied + * @return the stream after every pre-processor has been applied + */ private StreamInfo addPreProcessor(StreamInfo streamInfo, String tableName) { List preProcessors = PreProcessorFactory.getPreProcessors(daggerContext, tableName, telemetryExporter); for (Preprocessor preprocessor : preProcessors) { @@ -228,6 +308,14 @@ private StreamInfo addPreProcessor(StreamInfo streamInfo, String tableName) { return streamInfo; } + /** + * Builds the configured sink and attaches it to the stream as the job's terminal stage. + * + *
<p>
A {@code SinkOrchestrator} resolves the sink implementation from the configuration and the + * telemetry exporter is subscribed so that sink metrics are reported. + * + * @param streamInfo the fully processed stream to be written to the sink + */ private void addSink(StreamInfo streamInfo) { SinkOrchestrator sinkOrchestrator = new SinkOrchestrator(telemetryExporter); sinkOrchestrator.addSubscriber(telemetryExporter); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/ExampleStreamApiJobBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/ExampleStreamApiJobBuilder.java index 05c8f6600..530b5c796 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/ExampleStreamApiJobBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/ExampleStreamApiJobBuilder.java @@ -28,27 +28,60 @@ import java.util.List; import java.util.Map; +/** + * Example {@link JobBuilder} that demonstrates assembling a Dagger job with the raw Flink DataStream + * API instead of SQL. + * + *
<p>
This is a reference/template implementation (not used in production) showing how to register + * sources with pre-processors, keep references to specific input streams by name, and apply native + * Flink stream operators before sinking the result. The data-processing body is intentionally minimal + * and contains commented-out snippets illustrating common patterns such as keying and aggregation. + */ public class ExampleStreamApiJobBuilder implements JobBuilder { // static final String KEY_PATH = "meta.customer.id"; + /** Name of the first demo input stream this example keeps a reference to. */ private final String inputStreamName1 = "data_streams_0"; + /** Name of the second demo input stream this example keeps a reference to. */ private final String inputStreamName2 = "data_streams_1"; + /** Registered input streams keyed by stream name, populated during source registration. */ private final Map dataStreams = new HashMap<>(); + /** Shared context bundling configuration and the Flink execution environment. */ private final DaggerContext daggerContext; + /** Dagger job configuration resolved from the program arguments. */ private final Configuration configuration; + /** Flink streaming execution environment the job is built on and submitted to. */ private final StreamExecutionEnvironment executionEnvironment; + /** Provides Protobuf/stencil schema descriptors to sources, processors, and sinks. */ private StencilClientOrchestrator stencilClientOrchestrator; + /** StatsD reporter used to emit Dagger metrics. */ private DaggerStatsDReporter daggerStatsDReporter; + /** Collects and publishes job telemetry to subscribers as processors and sinks are added. */ private final MetricsTelemetryExporter telemetryExporter = new MetricsTelemetryExporter(); + /** + * Creates the example job builder bound to the given Dagger context. 
+ * + * @param daggerContext the shared context providing configuration and the Flink execution + * environment + */ public ExampleStreamApiJobBuilder(DaggerContext daggerContext) { this.daggerContext = daggerContext; this.configuration = daggerContext.getConfiguration(); this.executionEnvironment = daggerContext.getExecutionEnvironment(); } + /** + * {@inheritDoc} + * + *
<p>
Builds the {@link StencilClientOrchestrator} and {@link DaggerStatsDReporter} and applies a + * minimal set of Flink runtime settings (max parallelism, exactly-once checkpointing, and global + * job parameters) onto the execution environment. + * + * @return this builder, for fluent chaining + */ @Override public JobBuilder registerConfigs() { stencilClientOrchestrator = new StencilClientOrchestrator(configuration); @@ -65,6 +98,15 @@ public JobBuilder registerConfigs() { return this; } + /** + * {@inheritDoc} + * + *
<p>
Registers each configured source as a watermark-assigned {@code DataStream} of {@code Row} + * records, runs it through the configured pre-processors, and stores the streams named + * {@code data_streams_0} and {@code data_streams_1} for later use by the example output stage. + * + * @return this builder, for fluent chaining + */ @Override public JobBuilder registerSourceWithPreProcessors() { long watermarkDelay = configuration.getLong(Constants.FLINK_WATERMARK_DELAY_MS_KEY, Constants.FLINK_WATERMARK_DELAY_MS_DEFAULT); @@ -98,11 +140,29 @@ public JobBuilder registerSourceWithPreProcessors() { return this; } + /** + * {@inheritDoc} + * + *
<p>
This example registers no user-defined functions and simply returns the builder unchanged. + * + * @return this builder, for fluent chaining + * @throws IOException declared to satisfy the interface; never thrown by this implementation + */ @Override public JobBuilder registerFunctions() throws IOException { return this; } + /** + * {@inheritDoc} + * + *
<p>
Takes the registered {@code data_streams_0} input, applies a trivial keyed aggregation using + * the Flink DataStream API as a placeholder for real processing logic, and routes the result to + * the configured sink. Richer keying/aggregation patterns are shown in the commented-out snippets. + * + * @return this builder, for fluent chaining + * @throws NullPointerException if the expected input stream was not registered + */ @Override public JobBuilder registerOutputStream() { // NOTE - GET THE DATASTREAM REFERENCE @@ -142,11 +202,26 @@ public JobBuilder registerOutputStream() { return this; } + /** + * {@inheritDoc} + * + *
<p>
Submits the assembled pipeline to the Flink execution environment under the configured job + * name. + * + * @throws Exception if the Flink job fails to submit or execute + */ @Override public void execute() throws Exception { executionEnvironment.execute(configuration.getString(Constants.FLINK_JOB_ID_KEY, Constants.FLINK_JOB_ID_DEFAULT)); } + /** + * Applies all configured pre-processors for a source stream in sequence. + * + * @param streamInfo the stream to pre-process + * @param tableName the name of the source/table whose pre-processors should be applied + * @return the stream after every pre-processor has been applied + */ private StreamInfo addPreProcessor(StreamInfo streamInfo, String tableName) { List preProcessors = PreProcessorFactory.getPreProcessors(daggerContext, tableName, telemetryExporter); for (Preprocessor preprocessor : preProcessors) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/JobBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/JobBuilder.java index a2aa07a08..0112c7356 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/JobBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/JobBuilder.java @@ -17,13 +17,39 @@ */ public interface JobBuilder { + /** + * Initializes the Flink execution and table environments and registers global job settings. + * + * @return this builder, to allow fluent chaining of the registration stages + */ JobBuilder registerConfigs(); + /** + * Registers the configured source streams, assigns their watermarks and applies pre-processors. + * + * @return this builder, to allow fluent chaining of the registration stages + */ JobBuilder registerSourceWithPreProcessors(); + /** + * Registers the user-defined functions (including Python UDFs) used by the SQL query. 
+ * + * @return this builder, to allow fluent chaining of the registration stages + * @throws IOException if a function factory or Python UDF resource cannot be loaded + */ JobBuilder registerFunctions() throws IOException; + /** + * Runs the configured SQL query, applies the post-processors and attaches the configured sink. + * + * @return this builder, to allow fluent chaining of the registration stages + */ JobBuilder registerOutputStream(); + /** + * Submits and runs the assembled Flink job. + * + * @throws Exception if the job fails to execute + */ void execute() throws Exception; } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/KafkaProtoSQLProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/KafkaProtoSQLProcessor.java index 19798d03d..445bd873d 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/KafkaProtoSQLProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/KafkaProtoSQLProcessor.java @@ -43,6 +43,17 @@ public static void main(String[] args) throws ProgramInvocationException { } } + /** + * Reflectively creates the {@link JobBuilder} instance configured for the job. + * + *
<p>
The implementation class is read from the {@code JOB_BUILDER_FQCN} configuration key and + * must expose a constructor accepting a single {@code DaggerContext} argument. When the class + * cannot be loaded or instantiated, the failure is logged and a {@link DaggerSqlJobBuilder} is + * returned as a fallback. + * + * @param daggerContext the context passed to the job builder's constructor + * @return the configured job builder, or a {@link DaggerSqlJobBuilder} when instantiation fails + */ private static JobBuilder getJobBuilderInstance(DaggerContext daggerContext) { String className = daggerContext.getConfiguration().getString(JOB_BUILDER_FQCN_KEY, Constants.DEFAULT_JOB_BUILDER_FQCN); try { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/CommandlineConfigurationProvider.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/CommandlineConfigurationProvider.java index bd7972ef4..59026f4f6 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/CommandlineConfigurationProvider.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/CommandlineConfigurationProvider.java @@ -12,7 +12,15 @@ */ public class CommandlineConfigurationProvider implements ConfigurationProvider { + /** + * The raw command-line arguments supplied to the Dagger job, later wrapped in Flink's + * {@code ParameterTool}. + */ private String[] args; + /** + * Shared {@link Gson} instance used to decode Base64-encoded program arguments into a + * {@code String[]}. + */ private static final Gson GSON = new Gson(); /** @@ -25,6 +33,14 @@ public CommandlineConfigurationProvider(String[] args) { this.args = args; } + /** + * Builds a {@link Configuration} from the command-line arguments. + * + *
<p>
When an {@code encodedArgs} parameter is present, the Base64-encoded arguments are decoded + * and used in place of the raw arguments before being wrapped in Flink's {@code ParameterTool}. + * + * @return the configuration parsed from the (possibly decoded) command-line arguments + */ private Configuration constructParamTool() { String[] finalArgs = args; if (isEncodedArgsPresent()) { @@ -33,17 +49,38 @@ private Configuration constructParamTool() { return new Configuration(ParameterTool.fromArgs(finalArgs)); } + /** + * Checks whether the command-line arguments carry a Base64-encoded {@code encodedArgs} entry. + * + * @return {@code true} if an {@code encodedArgs} argument was supplied, {@code false} otherwise + */ private boolean isEncodedArgsPresent() { String encodedArgs = ParameterTool.fromArgs(args).get("encodedArgs"); return encodedArgs != null; } + /** + * Decodes the Base64-encoded {@code encodedArgs} argument into the actual program arguments. + * + *
<p>
The decoded payload is expected to be a JSON-encoded {@code String[]}, which is parsed + * with {@link Gson}. + * + * @return the decoded program arguments as a {@code String[]} + */ private String[] parseEncodedProgramArgs() { String encodedArgs = ParameterTool.fromArgs(args).get("encodedArgs"); byte[] decoded = Base64.getMimeDecoder().decode(encodedArgs); return GSON.fromJson(new String(decoded), String[].class); } + /** + * {@inheritDoc} + * + *
<p>
Logs the resolved parameters to standard output and returns the {@link Configuration} + * built from the command-line arguments. + * + * @return the configuration derived from the command-line arguments + */ @Override public Configuration get() { System.out.println("params from " + CommandlineConfigurationProvider.class.getName()); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/ConfigurationProviderFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/ConfigurationProviderFactory.java index 4594ef9df..a05244c25 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/ConfigurationProviderFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/ConfigurationProviderFactory.java @@ -10,8 +10,18 @@ * The Factory class for configuration provider. */ public class ConfigurationProviderFactory { + /** + * Logger used to record the supplied program arguments and the resolved configuration source. + */ private static final Logger LOGGER = LoggerFactory.getLogger(ConfigurationProviderFactory.class); + /** + * The raw command-line arguments forwarded to the selected {@link ConfigurationProvider}. + */ private String[] args; + /** + * System-property key identifying the configuration source, such as {@code "FILE"}, + * {@code "ARGS"}, or {@code "ENVIRONMENT"}. 
+ */ public static final String CONFIG_SOURCE = "ConfigSource"; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/EnvironmentConfigurationProvider.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/EnvironmentConfigurationProvider.java index 1460d00a6..38e5b5898 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/EnvironmentConfigurationProvider.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/EnvironmentConfigurationProvider.java @@ -11,6 +11,10 @@ */ public class EnvironmentConfigurationProvider implements ConfigurationProvider { + /** + * The environment variables used to build the {@link Configuration}, typically the process + * environment from {@code System.getenv()}. + */ private Map environmentParameters; /** @@ -22,6 +26,14 @@ public EnvironmentConfigurationProvider(Map environmentParameter this.environmentParameters = environmentParameters; } + /** + * {@inheritDoc} + * + *
<p>
Builds the {@link Configuration} from the supplied environment variables, wrapping them in + * Flink's {@code ParameterTool}. + * + * @return the configuration derived from the environment variables + */ @Override public Configuration get() { return new Configuration(ParameterTool.fromMap(environmentParameters)); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/FileConfigurationProvider.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/FileConfigurationProvider.java index 01b76130f..41d4f52c2 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/FileConfigurationProvider.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/FileConfigurationProvider.java @@ -41,11 +41,22 @@ public FileConfigurationProvider() { this.environmentParameters.entrySet().forEach(t -> System.out.println(t.getKey() + t.getValue())); } + /** + * {@inheritDoc} + * + *
<p>
Builds the {@link Configuration} from the properties loaded out of the configuration file, + * wrapping them in Flink's {@code ParameterTool}. + * + * @return the configuration loaded from the properties file + */ @Override public Configuration get() { return new Configuration(ParameterTool.fromMap(this.environmentParameters)); } + /** + * The key/value pairs loaded from the Dagger properties file, used to build the configuration. + */ private Map environmentParameters; } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/KafkaEnvironmentVariables.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/KafkaEnvironmentVariables.java index 2342e29df..477f37300 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/config/KafkaEnvironmentVariables.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/config/KafkaEnvironmentVariables.java @@ -9,6 +9,9 @@ */ public class KafkaEnvironmentVariables { + /** + * Prefix marking Dagger configuration keys that carry raw Kafka consumer properties. + */ private static final String KAFKA_PREFIX = "source_kafka_config_"; /** @@ -31,6 +34,16 @@ public static Properties parse(Configuration configuration) { return props; } + /** + * Converts a prefixed Dagger configuration key into its native Kafka property name. + * + *

The {@code source_kafka_config_} prefix is stripped and the remaining underscore-separated + * segments are lower-cased and joined with dots, so {@code source_kafka_config_bootstrap_servers} + * becomes {@code bootstrap.servers}. + * + * @param varName the prefixed Dagger configuration key to convert + * @return the equivalent dot-separated Kafka property name + */ private static String parseVarName(String varName) { String[] names = varName.toLowerCase().replaceAll(KAFKA_PREFIX, "").split("_"); return String.join(".", names); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/DaggerDeserializerFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/DaggerDeserializerFactory.java index b4b4750e4..2b786bd4c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/DaggerDeserializerFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/DaggerDeserializerFactory.java @@ -13,7 +13,27 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +/** + * Factory that selects and builds the {@link DaggerDeserializer} appropriate for a stream's source + * and data type. + * + *

It evaluates the known providers in order — JSON, Protobuf over Kafka, and Protobuf over + * Parquet — and returns the deserializer from the first provider that can handle the supplied + * {@link StreamConfig}. When no provider matches, the failure is reported through StatsD and a + * {@link DaggerConfigurationException} is thrown. + */ public class DaggerDeserializerFactory { + /** + * Builds the deserializer that matches the given stream configuration. + * + * @param streamConfig the stream configuration describing the source(s) and data type + * @param configuration the job configuration passed through to the providers + * @param stencilClientOrchestrator the Stencil orchestrator used by Protobuf-based providers + * @param statsDReporterSupplier supplies the StatsD reporter used to report a fatal error when + * no compatible deserializer is found + * @return the deserializer produced by the first compatible provider + * @throws DaggerConfigurationException if no provider can deserialize the configured stream + */ public static DaggerDeserializer create(StreamConfig streamConfig, Configuration configuration, StencilClientOrchestrator stencilClientOrchestrator, SerializedStatsDReporterSupplier statsDReporterSupplier) { return getDaggerDeserializerProviders(streamConfig, configuration, stencilClientOrchestrator) .stream() @@ -28,6 +48,17 @@ public static DaggerDeserializer create(StreamConfig streamConfig, Configur .getDaggerDeserializer(); } + /** + * Builds the ordered list of candidate deserializer providers to evaluate. + * + *

The order defines precedence when more than one provider could match: JSON first, then + * Protobuf over Kafka, then Protobuf over Parquet. + * + * @param streamConfig the stream configuration handed to each provider + * @param configuration the job configuration handed to each provider + * @param stencilClientOrchestrator the Stencil orchestrator handed to the Protobuf-based providers + * @return the list of providers in evaluation order + */ private static List> getDaggerDeserializerProviders(StreamConfig streamConfig, Configuration configuration, StencilClientOrchestrator stencilClientOrchestrator) { return Stream.of( new JsonDeserializerProvider(streamConfig), diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/DaggerDeserializerProvider.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/DaggerDeserializerProvider.java index d46f519a8..e01e8d8bc 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/DaggerDeserializerProvider.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/DaggerDeserializerProvider.java @@ -2,7 +2,32 @@ import com.gotocompany.dagger.common.serde.DaggerDeserializer; +/** + * Strategy interface for components that can supply a {@link DaggerDeserializer} for a Dagger + * stream source. + * + *

<p>Each implementation targets a particular combination of source (for example Kafka or Parquet) + * and input schema/data type (for example Protobuf or JSON). The {@link DaggerDeserializerFactory} + * asks every provider whether it {@link #canProvide() can provide} a deserializer for the current + * stream configuration and uses the first one that accepts. + * + * @param <D> the record type produced by the deserializer, typically a Flink + * {@link org.apache.flink.types.Row} + */ public interface DaggerDeserializerProvider<D> { + /** + * Builds the deserializer for the current stream configuration. + * + *

<p>Should only be called when {@link #canProvide()} returns {@code true}. + * + * @return the deserializer that turns raw source records into {@code D} instances + */ DaggerDeserializer<D> getDaggerDeserializer(); + + /** + * Indicates whether this provider can supply a deserializer for the current stream configuration. + * + * @return {@code true} if the configured source and data type are compatible with this provider + */ boolean canProvide(); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/JsonDeserializerProvider.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/JsonDeserializerProvider.java index e7cc9b3c0..700f516a2 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/JsonDeserializerProvider.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/JsonDeserializerProvider.java @@ -13,20 +13,53 @@ import static com.gotocompany.dagger.common.serde.DataTypes.JSON; +/** + * Supplies a {@link JsonDeserializer} for Kafka-backed streams whose payload is JSON encoded. + * + *

This provider accepts a stream only when every configured source is a Kafka source + * ({@link SourceName#KAFKA_SOURCE} or {@link SourceName#KAFKA_CONSUMER}) and the input schema type + * is {@link DataTypes#JSON}. The resulting deserializer maps JSON documents onto Flink + * {@link Row} records using the configured JSON schema and event-timestamp field. + */ public class JsonDeserializerProvider implements DaggerDeserializerProvider { + /** Stream configuration carrying the JSON schema and the event-timestamp field name. */ private final StreamConfig streamConfig; + /** Source types this provider can deserialize: the Kafka source and the Kafka consumer. */ private static final HashSet COMPATIBLE_SOURCES = new HashSet<>(Arrays.asList(SourceName.KAFKA_SOURCE, SourceName.KAFKA_CONSUMER)); + /** Input schema type this provider handles, namely JSON. */ private static final DataTypes COMPATIBLE_INPUT_SCHEMA_TYPE = JSON; + /** + * Creates a JSON deserializer provider for the given stream. + * + * @param streamConfig the stream configuration carrying the JSON schema and the JSON + * event-timestamp field name + */ public JsonDeserializerProvider(StreamConfig streamConfig) { this.streamConfig = streamConfig; } + /** + * {@inheritDoc} + * + *

Constructs a {@link JsonDeserializer} from the configured JSON schema and event-timestamp + * field name. + * + * @return a JSON-to-{@link Row} deserializer for the configured stream + */ @Override public DaggerDeserializer getDaggerDeserializer() { return new JsonDeserializer(streamConfig.getJsonSchema(), streamConfig.getJsonEventTimestampFieldName()); } + /** + * {@inheritDoc} + * + *

Returns {@code true} only when every configured source is a compatible Kafka source and the + * stream's data type is {@link DataTypes#JSON}. + * + * @return {@code true} if this provider can deserialize the configured stream, {@code false} otherwise + */ @Override public boolean canProvide() { SourceDetails[] sourceDetailsList = streamConfig.getSourceDetails(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/ProtoDeserializerProvider.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/ProtoDeserializerProvider.java index e165aeb60..0c3684468 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/ProtoDeserializerProvider.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/ProtoDeserializerProvider.java @@ -16,19 +16,48 @@ import static com.gotocompany.dagger.common.serde.DataTypes.PROTO; +/** + * Supplies a {@link ProtoDeserializer} for Kafka-backed streams whose payload is Protobuf encoded. + * + *

This provider accepts a stream only when every configured source is a Kafka source + * ({@link SourceName#KAFKA_SOURCE} or {@link SourceName#KAFKA_CONSUMER}) and the input schema type + * is {@link DataTypes#PROTO}. The resulting deserializer decodes Protobuf messages into Flink + * {@link Row} records using the configured proto class and the Stencil schema registry. + */ public class ProtoDeserializerProvider implements DaggerDeserializerProvider { + /** Source types this provider can deserialize: the Kafka source and the Kafka consumer. */ private static final HashSet COMPATIBLE_SOURCES = new HashSet<>(Arrays.asList(SourceName.KAFKA_SOURCE, SourceName.KAFKA_CONSUMER)); + /** Input schema type this provider handles, namely Protobuf. */ private static final DataTypes COMPATIBLE_INPUT_SCHEMA_TYPE = PROTO; + /** Stream configuration describing the source(s), proto class, and event-timestamp field. */ protected final StreamConfig streamConfig; + /** Job configuration, used here to resolve the Flink rowtime attribute name. */ protected final Configuration configuration; + /** Orchestrator providing Stencil descriptors for decoding the Protobuf payload. */ protected final StencilClientOrchestrator stencilClientOrchestrator; + /** + * Creates a Protobuf deserializer provider for the given stream. + * + * @param streamConfig the stream configuration carrying source details, the proto + * class name, and the event-timestamp field index + * @param configuration the job configuration used to resolve the rowtime attribute name + * @param stencilClientOrchestrator the Stencil orchestrator supplying Protobuf descriptors + */ public ProtoDeserializerProvider(StreamConfig streamConfig, Configuration configuration, StencilClientOrchestrator stencilClientOrchestrator) { this.streamConfig = streamConfig; this.configuration = configuration; this.stencilClientOrchestrator = stencilClientOrchestrator; } + /** + * {@inheritDoc} + * + *

Constructs a {@link ProtoDeserializer} from the configured proto class, event-timestamp + * field index, and resolved rowtime attribute name, wired to the Stencil orchestrator. + * + * @return a Protobuf-to-{@link Row} deserializer for the configured stream + */ @Override public DaggerDeserializer getDaggerDeserializer() { int timestampFieldIndex = Integer.parseInt(streamConfig.getEventTimestampFieldIndex()); @@ -37,6 +66,14 @@ public DaggerDeserializer getDaggerDeserializer() { return new ProtoDeserializer(protoClassName, timestampFieldIndex, rowTimeAttributeName, stencilClientOrchestrator); } + /** + * {@inheritDoc} + * + *

Returns {@code true} only when every configured source is a compatible Kafka source and the + * stream's data type is {@link DataTypes#PROTO}. + * + * @return {@code true} if this provider can deserialize the configured stream, {@code false} otherwise + */ @Override public boolean canProvide() { SourceDetails[] sourceDetailsList = streamConfig.getSourceDetails(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/SimpleGroupDeserializerProvider.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/SimpleGroupDeserializerProvider.java index 41c853701..9679571a8 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/SimpleGroupDeserializerProvider.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/deserializer/SimpleGroupDeserializerProvider.java @@ -13,19 +13,50 @@ import static com.gotocompany.dagger.common.serde.DataTypes.PROTO; +/** + * Supplies a {@link SimpleGroupDeserializer} for Parquet-backed streams whose records follow a + * Protobuf schema. + * + *

This provider accepts a stream only when every configured source is a + * {@link SourceName#PARQUET_SOURCE} and the input schema type is {@link DataTypes#PROTO}. The + * resulting deserializer converts Parquet {@code SimpleGroup} rows into Flink {@link Row} records + * using the configured proto class and the Stencil schema registry. + */ public class SimpleGroupDeserializerProvider implements DaggerDeserializerProvider { + /** Stream configuration describing the source(s), proto class, and event-timestamp field. */ protected final StreamConfig streamConfig; + /** Job configuration, used here to resolve the Flink rowtime attribute name. */ protected final Configuration configuration; + /** Orchestrator providing Stencil descriptors for interpreting the Parquet rows. */ protected final StencilClientOrchestrator stencilClientOrchestrator; + /** The single source type this provider handles, namely the Parquet source. */ private static final SourceName COMPATIBLE_SOURCE = SourceName.PARQUET_SOURCE; + /** Input schema type this provider handles, namely Protobuf. */ private static final DataTypes COMPATIBLE_INPUT_SCHEMA_TYPE = PROTO; + /** + * Creates a Parquet {@code SimpleGroup} deserializer provider for the given stream. + * + * @param streamConfig the stream configuration carrying source details, the proto + * class name, and the event-timestamp field index + * @param configuration the job configuration used to resolve the rowtime attribute name + * @param stencilClientOrchestrator the Stencil orchestrator supplying Protobuf descriptors + */ public SimpleGroupDeserializerProvider(StreamConfig streamConfig, Configuration configuration, StencilClientOrchestrator stencilClientOrchestrator) { this.streamConfig = streamConfig; this.configuration = configuration; this.stencilClientOrchestrator = stencilClientOrchestrator; } + /** + * {@inheritDoc} + * + *

Constructs a {@link SimpleGroupDeserializer} from the configured proto class, + * event-timestamp field index, and resolved rowtime attribute name, wired to the Stencil + * orchestrator. + * + * @return a Parquet-{@code SimpleGroup}-to-{@link Row} deserializer for the configured stream + */ @Override public DaggerDeserializer getDaggerDeserializer() { int timestampFieldIndex = Integer.parseInt(streamConfig.getEventTimestampFieldIndex()); @@ -34,6 +65,14 @@ public DaggerDeserializer getDaggerDeserializer() { return new SimpleGroupDeserializer(protoClassName, timestampFieldIndex, rowTimeAttributeName, stencilClientOrchestrator); } + /** + * {@inheritDoc} + * + *

Returns {@code true} only when every configured source is the + * {@link SourceName#PARQUET_SOURCE} and the stream's data type is {@link DataTypes#PROTO}. + * + * @return {@code true} if this provider can deserialize the configured stream, {@code false} otherwise + */ @Override public boolean canProvide() { SourceDetails[] sourceDetailsList = streamConfig.getSourceDetails(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/enumeration/KafkaConnectorTypesMetadata.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/enumeration/KafkaConnectorTypesMetadata.java index 9a7739df0..727e72d5a 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/enumeration/KafkaConnectorTypesMetadata.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/enumeration/KafkaConnectorTypesMetadata.java @@ -2,15 +2,41 @@ import java.util.regex.Pattern; +/** + * Enumerates the two kinds of Kafka connector that Dagger configures, each paired with the prefix + * used to namespace its configuration keys. + * + *

Dagger reads Kafka client settings from a flat configuration in which each key is prefixed + * according to whether it applies to the consumer (source) or producer (sink). This enum captures + * that prefix as a regex fragment so the matching consumer/producer properties can be extracted from + * the overall configuration. The constants are {@code SOURCE} (consumer settings prefixed with + * {@code SOURCE_KAFKA_CONSUMER_CONFIG_}) and {@code SINK} (producer settings prefixed with + * {@code SINK_KAFKA_PRODUCER_CONFIG_}). + */ public enum KafkaConnectorTypesMetadata { SOURCE("SOURCE_KAFKA_CONSUMER_CONFIG_+"), SINK("SINK_KAFKA_PRODUCER_CONFIG_+"); + /** + * Associates a connector type with the prefix pattern that identifies its configuration keys. + * + * @param prefixPattern the leading regex fragment matching this connector's configuration keys + */ KafkaConnectorTypesMetadata(String prefixPattern) { this.prefixPattern = prefixPattern; } + /** Leading regex fragment matching the configuration keys that belong to this connector type. */ private final String prefixPattern; + /** + * Builds a case-insensitive {@link Pattern} matching configuration keys for this connector type + * and capturing the remainder of each key. + * + *

The returned pattern anchors on this type's prefix and exposes everything after it as a + * capturing group, so callers can strip the prefix and recover the underlying Kafka property name. + * + * @return a compiled, case-insensitive pattern for this connector's configuration keys + */ public Pattern getConfigurationPattern() { return Pattern.compile(String.format("^%s(.*)", prefixPattern), Pattern.CASE_INSENSITIVE); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/BigQueryWriterException.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/BigQueryWriterException.java index 5738bd827..5e60d920c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/BigQueryWriterException.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/BigQueryWriterException.java @@ -2,12 +2,31 @@ import java.io.IOException; +/** + * Checked exception signalling that writing records to BigQuery failed. + * + *

The BigQuery sink flushes batches of rows and inspects the resulting {@code SinkResponse}; when + * the response contains non-retryable errors this exception (extending {@link IOException}) is thrown + * from {@code BigQuerySinkWriter} so the failed flush propagates as a sink error rather than being + * silently dropped. + */ public class BigQueryWriterException extends IOException { + /** + * Creates a new exception with a message and the underlying cause. + * + * @param message human-readable detail describing the BigQuery write failure + * @param cause the underlying error that caused the write to fail + */ public BigQueryWriterException(String message, Throwable cause) { super(message, cause); } + /** + * Creates a new exception with a message describing the BigQuery write failure. + * + * @param message human-readable detail describing the BigQuery write failure + */ public BigQueryWriterException(String message) { super(message); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InfluxWriteException.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InfluxWriteException.java index 52082f514..5c49b68bd 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InfluxWriteException.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InfluxWriteException.java @@ -2,7 +2,20 @@ import java.io.IOException; +/** + * Checked exception signalling that writing points to InfluxDB failed. + * + *

The InfluxDB sink batches measurement points and flushes them asynchronously; when the InfluxDB + * client reports a write failure, the error handlers wrap the underlying cause in this exception + * (extending {@link IOException}). It is surfaced by the InfluxDB writer - for example during a + * checkpoint snapshot - and reported as a fatal exception so failed writes are not silently dropped. + */ public class InfluxWriteException extends IOException { + /** + * Creates a new exception wrapping the underlying InfluxDB write failure. + * + * @param err the underlying error reported by the InfluxDB client + */ public InfluxWriteException(Throwable err) { super(err); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InvalidDaggerSourceException.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InvalidDaggerSourceException.java index 8badd85c9..f40b9d037 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InvalidDaggerSourceException.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InvalidDaggerSourceException.java @@ -1,6 +1,20 @@ package com.gotocompany.dagger.core.exception; +/** + * Unchecked exception signalling that no Dagger source could be created for the configured source + * details. + * + *

{@code DaggerSourceFactory} inspects the {@code SOURCE_DETAILS} stream configuration and selects + * a matching source implementation (Kafka, Parquet, and so on). When none of the known sources can + * satisfy the configuration this exception is thrown and reported as a fatal exception, since the job + * has no valid input to read from. + */ public class InvalidDaggerSourceException extends RuntimeException { + /** + * Creates a new exception describing why a Dagger source could not be created. + * + * @param message human-readable detail, typically echoing the unsupported source configuration + */ public InvalidDaggerSourceException(String message) { super(message); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InvalidTimeRangeException.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InvalidTimeRangeException.java index 54b4ef911..511b77d4c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InvalidTimeRangeException.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/InvalidTimeRangeException.java @@ -1,6 +1,20 @@ package com.gotocompany.dagger.core.exception; +/** + * Unchecked exception signalling that a configured time range is invalid. + * + *

Dagger's bounded (Parquet) sources accept time ranges that bound which files are processed. + * This exception is raised while parsing that configuration (for example by + * {@code FileDateRangeAdaptor}) when a range is malformed - such as a start instant that is after the + * end instant, a value that is not a comma-separated pair of ISO-8601 timestamps, or a timestamp that + * cannot be parsed with the supported date formats. + */ public class InvalidTimeRangeException extends RuntimeException { + /** + * Creates a new exception describing why the time range is invalid. + * + * @param message human-readable detail describing the malformed time range + */ public InvalidTimeRangeException(String message) { super(message); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/ParquetFileSourceReaderInitializationException.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/ParquetFileSourceReaderInitializationException.java index 35ba8b023..828e0ca30 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/ParquetFileSourceReaderInitializationException.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/ParquetFileSourceReaderInitializationException.java @@ -4,6 +4,12 @@ * This exception is thrown when the reader for Parquet FileSource could not be initialized. */ public class ParquetFileSourceReaderInitializationException extends RuntimeException { + /** + * Instantiates a new Parquet file source reader initialization exception wrapping the cause. 
+ * + * @param cause the underlying error that prevented the Parquet {@code FileSource} reader from + * being initialized + */ public ParquetFileSourceReaderInitializationException(Throwable cause) { super(cause); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/PathParserNotProvidedException.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/PathParserNotProvidedException.java index 1d4a02609..6f84df616 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/PathParserNotProvidedException.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/exception/PathParserNotProvidedException.java @@ -1,6 +1,20 @@ package com.gotocompany.dagger.core.exception; +/** + * Unchecked exception signalling that a Parquet file-path parser was required but none was supplied. + * + *

When Dagger reads bounded/Parquet sources it derives the chronological ordering of file splits + * from the timestamp encoded in each file path, which requires a configured path parser. If the + * parser is missing this exception is raised (for example from + * {@code ChronologyOrderedSplitAssigner} with the message {@code "Path parser is null"}) and reported + * as a fatal exception, so the job cannot proceed with an undefined split ordering. + */ public class PathParserNotProvidedException extends RuntimeException { + /** + * Creates a new exception indicating that a required path parser was not provided. + * + * @param message human-readable detail describing the missing path parser + */ public PathParserNotProvidedException(String message) { super(message); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ChronologyOrderedSplitAssignerAspects.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ChronologyOrderedSplitAssignerAspects.java index 860b0f829..96b6bf3b3 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ChronologyOrderedSplitAssignerAspects.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ChronologyOrderedSplitAssignerAspects.java @@ -3,24 +3,64 @@ import com.gotocompany.dagger.common.metrics.aspects.AspectType; import com.gotocompany.dagger.common.metrics.aspects.Aspects; +/** + * Metric aspects emitted by the chronology-ordered split assigner of the Parquet/file source. + * + *

The chronology-ordered split assigner hands {@code FileSourceSplit}s to Flink source readers in + * timestamp order. These aspects expose how many splits have been discovered, recorded as pending + * work, and are still awaiting assignment; they are published through the StatsD-backed gauge manager + * under the split-assigner component tag. + */ public enum ChronologyOrderedSplitAssignerAspects implements Aspects { + /** + * Gauge tracking the total number of file splits discovered and handed to the assigner. + */ TOTAL_SPLITS_DISCOVERED("total_splits_discovered", AspectType.Gauge), + /** + * Gauge tracking the number of splits the assigner has recorded as pending work. + */ TOTAL_SPLITS_RECORDED("total_splits_recorded", AspectType.Gauge), + /** + * Tracks how many recorded splits are still waiting to be assigned to a source reader. + */ SPLITS_AWAITING_ASSIGNMENT("splits_awaiting_assignment", AspectType.Counter); + /** + * Binds the metric name and metric type for this aspect constant. + * + * @param value the metric name to publish on the Flink metric group + * @param aspectType the kind of metric this aspect is reported as + */ ChronologyOrderedSplitAssignerAspects(String value, AspectType aspectType) { this.value = value; this.aspectType = aspectType; } + /** The metric name published for this aspect on the Flink metric group. */ private final String value; + /** The kind of metric this aspect is reported as; see {@link AspectType}. */ private final AspectType aspectType; + /** + * {@inheritDoc} + * + *

Returns the metric name configured for this constant (the first constructor argument). + * + * @return the metric name for this aspect + */ @Override public String getValue() { return value; } + /** + * {@inheritDoc} + * + *

Returns the {@link AspectType} configured for this constant, which controls whether it is + * registered and reported as a meter, histogram, gauge, or counter. + * + * @return the metric type for this aspect + */ @Override public AspectType getAspectType() { return aspectType; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ExternalSourceAspects.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ExternalSourceAspects.java index 8a9016b82..4d96087f2 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ExternalSourceAspects.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ExternalSourceAspects.java @@ -7,39 +7,118 @@ * The enum External source aspects. */ public enum ExternalSourceAspects implements Aspects { + /** + * Counts how often a connection to the external client is closed. + */ CLOSE_CONNECTION_ON_EXTERNAL_CLIENT("close_connection_on_external_client", AspectType.Metric), + /** + * Counts failures encountered while reading the configured response path. + */ FAILURES_ON_READING_PATH("failures_on_reading_path", AspectType.Metric), + /** + * Counts the total number of failed external requests. + */ TOTAL_FAILED_REQUESTS("total_failures", AspectType.Metric), + /** + * Counts the number of external calls that timed out. + */ TIMEOUTS("timeouts", AspectType.Metric), + /** + * Records the response-time distribution for successful external calls. + */ SUCCESS_RESPONSE_TIME("success_response_time", AspectType.Histogram), + /** + * Records the response-time distribution for failed external calls. + */ FAILURES_RESPONSE_TIME("failure_response_time", AspectType.Histogram), + /** + * Counts the number of successful responses received from the external source. + */ SUCCESS_RESPONSE("success_response", AspectType.Metric), + /** + * Counts errors encountered while parsing an external response. 
+ */ ERROR_PARSING_RESPONSE("parse_errors", AspectType.Metric), + /** + * Counts errors raised while issuing an external request. + */ REQUEST_ERROR("request_error", AspectType.Metric), + /** + * Counts miscellaneous errors that do not fall into a more specific category. + */ OTHER_ERRORS("other_errors", AspectType.Metric), + /** + * Counts the total number of external calls attempted. + */ TOTAL_EXTERNAL_CALLS("total_external_calls", AspectType.Metric), + /** + * Counts records received with empty input for the external call. + */ EMPTY_INPUT("empty_input", AspectType.Metric), + /** + * Counts errors encountered while reading an external response. + */ ERROR_READING_RESPONSE("error_reading_response", AspectType.Metric), + /** + * Counts other errors encountered while processing an external response. + */ OTHER_ERRORS_PROCESSING_RESPONSE("other_errors_processing_response", AspectType.Metric), + /** + * Counts occurrences of invalid external source configuration. + */ INVALID_CONFIGURATION("invalid_configuration", AspectType.Metric), + /** + * Counts external responses that failed with a 5XX server error status code. + */ FAILURE_CODE_5XX("failures_code5XX", AspectType.Metric), + /** + * Counts external responses that failed with a 4XX client error status code. + */ FAILURE_CODE_4XX("failures_code4XX", AspectType.Metric), + /** + * Counts external responses that failed with a 404 not found status code. + */ FAILURE_CODE_404("failures_code404", AspectType.Metric), + /** + * Counts occurrences where the gRPC channel was not available for an external call. + */ GRPC_CHANNEL_NOT_AVAILABLE("grpc_channel_not_available", AspectType.Metric); + /** + * The metric identifier reported for this aspect. + */ private String value; + /** + * The metric type this aspect represents. + */ private AspectType aspectType; + /** + * Instantiates a new external source aspect. 
+ * + * @param value the metric identifier for this aspect + * @param aspectType the metric type this aspect represents + */ ExternalSourceAspects(String value, AspectType aspectType) { this.value = value; this.aspectType = aspectType; } + /** + * Returns the metric identifier reported for this aspect. + * + * @return the metric identifier configured for this aspect + */ @Override public String getValue() { return value; } + /** + * Returns the metric type associated with this aspect. + * + * @return the metric type configured for this aspect + */ @Override public AspectType getAspectType() { return aspectType; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/LongbowReaderAspects.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/LongbowReaderAspects.java index e515e2b8b..df6a4822f 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/LongbowReaderAspects.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/LongbowReaderAspects.java @@ -7,28 +7,74 @@ * The enum Longbow reader aspects. */ public enum LongbowReaderAspects implements Aspects { + /** + * Counts the number of Longbow reader operations that timed out. + */ TIMEOUTS_ON_READER("timeouts_on_reader", AspectType.Metric), + /** + * Counts how often the Longbow reader connection is closed. + */ CLOSE_CONNECTION_ON_READER("close_connection_on_reader", AspectType.Metric), + /** + * Counts documents successfully read by the Longbow reader. + */ SUCCESS_ON_READ_DOCUMENT("success_on_read_document", AspectType.Metric), + /** + * Records the response-time distribution for successful Longbow read operations. + */ SUCCESS_ON_READ_DOCUMENT_RESPONSE_TIME("success_on_read_document_response_time", AspectType.Histogram), + /** + * Records the distribution of documents read per Longbow scan. 
+ */ DOCUMENTS_READ_PER_SCAN("documents_read_per_scan", AspectType.Histogram), + /** + * Counts documents that failed to be read by the Longbow reader. + */ FAILED_ON_READ_DOCUMENT("failed_on_read_document", AspectType.Metric), + /** + * Records the response-time distribution for failed Longbow read operations. + */ FAILED_ON_READ_DOCUMENT_RESPONSE_TIME("failed_on_read_document_response_time", AspectType.Histogram), + /** + * Counts failures to read the last record during Longbow lookups. + */ FAILED_TO_READ_LAST_RECORD("failed_to_read_last_record", AspectType.Metric); + /** + * The metric identifier reported for this aspect. + */ private String value; + /** + * The metric type this aspect represents. + */ private AspectType aspectType; + /** + * Instantiates a new Longbow reader aspect. + * + * @param value the metric identifier for this aspect + * @param aspectType the metric type this aspect represents + */ LongbowReaderAspects(String value, AspectType aspectType) { this.value = value; this.aspectType = aspectType; } + /** + * Returns the metric identifier reported for this aspect. + * + * @return the metric identifier configured for this aspect + */ @Override public String getValue() { return value; } + /** + * Returns the metric type associated with this aspect. + * + * @return the metric type configured for this aspect + */ @Override public AspectType getAspectType() { return aspectType; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/LongbowWriterAspects.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/LongbowWriterAspects.java index ca7414a3c..b3b7fa4c2 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/LongbowWriterAspects.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/LongbowWriterAspects.java @@ -7,30 +7,82 @@ * The enum Longbow writer aspects. 
*/ public enum LongbowWriterAspects implements Aspects { + /** + * Counts successful BigTable table creations by the Longbow writer. + */ SUCCESS_ON_CREATE_BIGTABLE("success_on_create_bigtable", AspectType.Metric), + /** + * Records the response-time distribution for successful BigTable creations. + */ SUCCESS_ON_CREATE_BIGTABLE_RESPONSE_TIME("success_on_create_bigtable_response_time", AspectType.Histogram), + /** + * Counts failed BigTable table creations by the Longbow writer. + */ FAILURES_ON_CREATE_BIGTABLE("failed_on_create_bigtable", AspectType.Metric), + /** + * Records the response-time distribution for failed BigTable creations. + */ FAILURES_ON_CREATE_BIGTABLE_RESPONSE_TIME("failed_on_create_bigtable_response_time", AspectType.Histogram), + /** + * Counts the number of Longbow writer operations that timed out. + */ TIMEOUTS_ON_WRITER("timeouts_on_writer", AspectType.Metric), + /** + * Counts how often the Longbow writer connection is closed. + */ CLOSE_CONNECTION_ON_WRITER("close_connection_on_writer", AspectType.Metric), + /** + * Counts documents successfully written by the Longbow writer. + */ SUCCESS_ON_WRITE_DOCUMENT("success_on_write_document", AspectType.Metric), + /** + * Records the response-time distribution for successful Longbow write operations. + */ SUCCESS_ON_WRITE_DOCUMENT_RESPONSE_TIME("success_on_write_document_response_time", AspectType.Histogram), + /** + * Counts documents that failed to be written by the Longbow writer. + */ FAILED_ON_WRITE_DOCUMENT("failed_on_write_document", AspectType.Metric), + /** + * Records the response-time distribution for failed Longbow write operations. + */ FAILED_ON_WRITE_DOCUMENT_RESPONSE_TIME("failed_on_write_document_response_time", AspectType.Histogram); + /** + * The metric identifier reported for this aspect. + */ private String value; + /** + * The metric type this aspect represents. + */ private AspectType aspectType; + /** + * Instantiates a new Longbow writer aspect. 
+ * + * @param value the metric identifier for this aspect + * @param aspectType the metric type this aspect represents + */ LongbowWriterAspects(String value, AspectType aspectType) { this.value = value; this.aspectType = aspectType; } + /** + * Returns the metric identifier reported for this aspect. + * + * @return the metric identifier configured for this aspect + */ @Override public String getValue() { return value; } + /** + * Returns the metric type associated with this aspect. + * + * @return the metric type configured for this aspect + */ @Override public AspectType getAspectType() { return aspectType; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ParquetReaderAspects.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ParquetReaderAspects.java index d1b6827a1..2ed1b8fbd 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ParquetReaderAspects.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/ParquetReaderAspects.java @@ -3,26 +3,75 @@ import com.gotocompany.dagger.common.metrics.aspects.AspectType; import com.gotocompany.dagger.common.metrics.aspects.Aspects; +/** + * Metric aspects emitted by the Parquet file source reader as it reads and deserializes rows. + * + *

When Dagger sources data from Parquet files, each reader iterates a Parquet file, reads + * {@code SimpleGroup} records, and deserializes them into Flink {@code Row} instances emitted + * downstream. The constants below name the counters and histograms — reader lifecycle, rows emitted, + * and per-row read/deserialization timings — that the StatsD-backed counter and histogram managers + * report. The {@link AspectType} on each constant decides whether it is reported as a counter or a + * histogram. + */ public enum ParquetReaderAspects implements Aspects { + /** + * Counter incremented once each time a Parquet reader instance is created. + */ READER_CREATED("reader_created", AspectType.Counter), + /** + * Counter incremented once each time a Parquet reader instance is closed. + */ READER_CLOSED("reader_closed", AspectType.Counter), + /** + * Counter incremented for every row the reader emits downstream. + */ READER_ROWS_EMITTED("reader_rows_emitted", AspectType.Counter), + /** + * Histogram of the time, in milliseconds, taken to deserialize a single record into a Flink + * {@code Row}. + */ READER_ROW_DESERIALIZATION_TIME("reader_row_deserialization_time", AspectType.Histogram), + /** + * Histogram of the time, in milliseconds, taken to read a single record from the Parquet file. + */ READER_ROW_READ_TIME("reader_row_read_time", AspectType.Histogram); + /** The metric name published for this aspect on the Flink metric group. */ private final String value; + /** The kind of metric this aspect is reported as; see {@link AspectType}. */ private final AspectType aspectType; + /** + * Binds the metric name and metric type for this aspect constant. + * + * @param value the metric name to publish on the Flink metric group + * @param aspectType the kind of metric this aspect is reported as + */ ParquetReaderAspects(String value, AspectType aspectType) { this.value = value; this.aspectType = aspectType; } + /** + * {@inheritDoc} + * + *

Returns the metric name configured for this constant (the first constructor argument). + * + * @return the metric name for this aspect + */ @Override public String getValue() { return value; } + /** + * {@inheritDoc} + * + *

Returns the {@link AspectType} configured for this constant, which determines the kind of metric it is + * reported as; every constant in this enum is either a counter or a histogram. + * + * @return the metric type for this aspect + */ @Override public AspectType getAspectType() { return aspectType; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/TelemetryAspects.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/TelemetryAspects.java index e8daed1b7..d28344af3 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/TelemetryAspects.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/aspects/TelemetryAspects.java @@ -12,19 +12,41 @@ public enum TelemetryAspects implements Aspects { */ VALUE("value", AspectType.Metric); + /** + * The metric identifier reported for this aspect. + */ private String value; + /** + * The metric type this aspect represents. + */ private AspectType aspectType; + /** + * Instantiates a new telemetry aspect. + * + * @param value the metric identifier for this aspect + * @param aspectType the metric type this aspect represents + */ TelemetryAspects(String value, AspectType aspectType) { this.value = value; this.aspectType = aspectType; } + /** + * Returns the metric identifier reported for this aspect. + * + * @return the metric identifier configured for this aspect + */ @Override public String getValue() { return value; } + /** + * Returns the metric type associated with this aspect. 
+ * + * @return the metric type configured for this aspect + */ @Override public AspectType getAspectType() { return aspectType; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/ErrorReporter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/ErrorReporter.java index 68ea0ba2b..4a5084626 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/ErrorReporter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/ErrorReporter.java @@ -21,6 +21,18 @@ public interface ErrorReporter { void reportNonFatalException(Exception exception); + /** + * Registers and returns a {@link Counter} that tracks occurrences of the given exception. + * + *

The counter is nested under {@code metricGroupKey} within the supplied + * {@link MetricGroup} and is further keyed by the exception's fully-qualified class name, + * allowing failures to be aggregated per exception type. + * + * @param exception the exception whose occurrences are counted + * @param metricGroup the Flink metric group the counter is registered under + * @param metricGroupKey the key used to group the exception counter + * @return the counter tracking occurrences of the given exception type + */ default Counter addExceptionToCounter(Exception exception, MetricGroup metricGroup, String metricGroupKey) { return metricGroup.addGroup(metricGroupKey, exception.getClass().getName()).counter("value"); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/ErrorStatsReporter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/ErrorStatsReporter.java index 1293b5509..3ee1fe9df 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/ErrorStatsReporter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/ErrorStatsReporter.java @@ -12,15 +12,40 @@ * The Error stats reporter. */ public class ErrorStatsReporter implements ErrorReporter { + /** + * The Flink metric group under which exception counters are registered. + */ private MetricGroup metricGroup; + /** + * The time in milliseconds to sleep after reporting a fatal exception, allowing the metric to + * be flushed before the job shuts down. + */ private long shutDownPeriod; + /** + * Logger for interruptions during fatal-exception waits; named after MetricsTelemetryExporter, not this class. + */ private static final Logger LOGGER = LoggerFactory.getLogger(MetricsTelemetryExporter.class.getName()); + /** + * Instantiates a new error stats reporter. 
+ * + * @param metricGroup the Flink metric group used to register exception counters + * @param shutDownPeriod the time in milliseconds to wait after reporting a fatal exception + */ public ErrorStatsReporter(MetricGroup metricGroup, long shutDownPeriod) { this.metricGroup = metricGroup; this.shutDownPeriod = shutDownPeriod; } + /** + * Increments the fatal-exception counter for the given exception and then pauses for the + * configured shutdown period. + * + *

The pause gives the metric reporter time to publish the recorded failure before the job + * is torn down. An interruption during the wait is logged and otherwise ignored. + * + * @param exception the fatal exception to record + */ @Override public void reportFatalException(Exception exception) { Counter counter = addExceptionToCounter(exception, metricGroup, Constants.FATAL_EXCEPTION_METRIC_GROUP_KEY); @@ -32,6 +57,11 @@ public void reportFatalException(Exception exception) { } } + /** + * Increments the non-fatal-exception counter for the given exception. + * + * @param exception the non-fatal exception to record + */ @Override public void reportNonFatalException(Exception exception) { Counter counter = addExceptionToCounter(exception, metricGroup, Constants.NONFATAL_EXCEPTION_METRIC_GROUP_KEY); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/NoOpErrorReporter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/NoOpErrorReporter.java index 1d8633723..014c07c9e 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/NoOpErrorReporter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/NoOpErrorReporter.java @@ -4,11 +4,21 @@ * The No op error reporter. */ public class NoOpErrorReporter implements ErrorReporter { + /** + * No-op implementation that intentionally ignores the fatal exception. + * + * @param exception the exception, which is ignored + */ @Override public void reportFatalException(Exception exception) { } + /** + * No-op implementation that intentionally ignores the non-fatal exception. 
+ * + * @param exception the exception, which is ignored + */ @Override public void reportNonFatalException(Exception exception) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/DaggerMetricsConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/DaggerMetricsConfig.java index e58bad50b..26f5b5e9b 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/DaggerMetricsConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/DaggerMetricsConfig.java @@ -5,14 +5,39 @@ import org.apache.flink.configuration.ConfigOptions; import org.apache.flink.configuration.Configuration; +/** + * {@link MetricsConfig} implementation that sources StatsD connection settings from the Flink job + * {@link Configuration}. + * + *

Dagger publishes its metrics over StatsD through the depot reporter, which is configured by + * this adapter. The StatsD host and port are read from the Flink configuration keys + * {@code metrics.reporter.stsd.host} and {@code metrics.reporter.stsd.port}, falling back to + * {@code localhost} and {@code 8125} respectively when they are not set. {@link #getMetricStatsDTags()} + * returns an empty string because Dagger's global/extra tags are supplied separately when the + * reporter is built. + * + * @see DaggerStatsDReporter + */ public class DaggerMetricsConfig implements MetricsConfig { + /** Flink configuration key holding the StatsD host name. */ private static final String FLINK_STATSD_HOST_CONFIG_KEY = "metrics.reporter.stsd.host"; + /** Host used when the StatsD host key is absent from the Flink configuration. */ private static final String DEFAULT_STATSD_HOST_VALUE = "localhost"; + /** Flink configuration key holding the StatsD port. */ private static final String FLINK_STATSD_PORT_CONFIG_KEY = "metrics.reporter.stsd.port"; + /** Port used when the StatsD port key is absent from the Flink configuration. */ private static final int DEFAULT_STATSD_PORT_VALUE = 8125; + /** Resolved StatsD host name. */ private final String hostName; + /** Resolved StatsD port. */ private final int port; + /** + * Resolves the StatsD host and port from the supplied Flink configuration, applying the defaults + * when the keys are missing. + * + * @param flinkConfiguration the Flink job configuration to read the StatsD host and port from + */ public DaggerMetricsConfig(Configuration flinkConfiguration) { ConfigOption hostConfigOption = ConfigOptions .key(FLINK_STATSD_HOST_CONFIG_KEY) @@ -26,16 +51,34 @@ public DaggerMetricsConfig(Configuration flinkConfiguration) { this.port = flinkConfiguration.getInteger(portConfigOption); } + /** + * Returns the StatsD host name to which metrics are sent. 
+ * + * @return the configured host, or {@code localhost} when unset + */ @Override public String getMetricStatsDHost() { return hostName; } + /** + * Returns the StatsD port to which metrics are sent. + * + * @return the configured port, or {@code 8125} when unset + */ @Override public Integer getMetricStatsDPort() { return port; } + /** + * Returns the static tags configured directly on the metrics config. + * + *

Always an empty string for Dagger; global tags such as the job id are instead attached as + * extra tags when the reporter is constructed in {@link DaggerStatsDReporter}. + * + * @return an empty string, indicating no statically-configured tags + */ @Override public String getMetricStatsDTags() { return ""; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/DaggerStatsDReporter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/DaggerStatsDReporter.java index a05484a23..c1accefb9 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/DaggerStatsDReporter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/DaggerStatsDReporter.java @@ -12,16 +12,45 @@ import static com.gotocompany.dagger.core.utils.Constants.FLINK_JOB_ID_DEFAULT; import static com.gotocompany.dagger.core.utils.Constants.FLINK_JOB_ID_KEY; +/** + * Default {@link SerializedStatsDReporterSupplier} that lazily builds and shares a single depot + * {@link StatsDReporter} for the whole Dagger job. + * + *

Because the supplier is serialized into the Flink job graph, only the two configuration objects + * it needs are kept as fields; the actual reporter is created on demand in + * {@link #buildStatsDReporter()}. The reporter is held in a {@code static} field so that every task + * running in the same JVM/TaskManager reuses one instance, configured with a {@link DaggerMetricsConfig} + * and the job's global tags (currently the Flink job id). Instances of this supplier are created + * through the nested {@link Provider} factory. + */ public class DaggerStatsDReporter implements SerializedStatsDReporterSupplier { + /** Process-wide, lazily-initialized reporter shared by every task in the JVM. */ private static StatsDReporter statsDReporter; + /** Flink configuration used to resolve the StatsD host/port via {@link DaggerMetricsConfig}. */ private final Configuration flinkConfiguration; + /** Dagger configuration used to derive global tag values such as the job id. */ private final com.gotocompany.dagger.common.configuration.Configuration daggerConfiguration; + /** + * Creates a supplier bound to the given Flink and Dagger configurations. + * + * @param flinkConfiguration Flink configuration providing the StatsD host and port + * @param daggerConfiguration Dagger configuration providing global tag values such as the job id + */ private DaggerStatsDReporter(Configuration flinkConfiguration, com.gotocompany.dagger.common.configuration.Configuration daggerConfiguration) { this.flinkConfiguration = flinkConfiguration; this.daggerConfiguration = daggerConfiguration; } + /** + * Builds the global tags attached to every metric emitted through the shared reporter. + * + *

Currently this is just the {@link GlobalTags#JOB_ID} tag, populated from the Dagger + * configuration (falling back to {@code FLINK_JOB_ID_DEFAULT} when unset) and rendered into the + * StatsD {@code key=value} form via {@link StatsDTag#getFormattedTag()}. + * + * @return the formatted global tag strings to register as extra tags on the reporter + */ private String[] generateGlobalTags() { StatsDTag[] globalTags = new StatsDTag[]{ new StatsDTag(GlobalTags.JOB_ID, daggerConfiguration.getString(FLINK_JOB_ID_KEY, FLINK_JOB_ID_DEFAULT))}; @@ -30,6 +59,16 @@ private String[] generateGlobalTags() { .toArray(String[]::new); } + /** + * {@inheritDoc} + * + *

On the first call this constructs the shared reporter from a {@link DaggerMetricsConfig} and + * the {@linkplain #generateGlobalTags() global tags}; subsequent calls return the cached static + * instance. The check is not synchronized, so concurrent first calls may briefly race to build + * the reporter. + * + * @return the process-wide depot {@link StatsDReporter} + */ @Override public StatsDReporter buildStatsDReporter() { if (statsDReporter == null) { @@ -44,6 +83,14 @@ public StatsDReporter buildStatsDReporter() { return statsDReporter; } + /** + * Closes the shared reporter and clears the cached instance so it can be rebuilt later. + * + *

Package/subclass-visible hook used to release the underlying StatsD resources, for example + * during job teardown or between tests. Safe to call when no reporter has been built yet. + * + * @throws IOException if the underlying reporter fails to close + */ protected static void close() throws IOException { if (statsDReporter != null) { statsDReporter.close(); @@ -51,7 +98,17 @@ protected static void close() throws IOException { } } + /** + * Factory for {@link DaggerStatsDReporter} instances, exposing the otherwise private constructor. + */ public static class Provider { + /** + * Creates a new supplier for the given configurations. + * + * @param flinkConfiguration Flink configuration providing the StatsD host and port + * @param daggerConfiguration Dagger configuration providing global tag values such as the job id + * @return a new {@link DaggerStatsDReporter} + */ public static DaggerStatsDReporter provide(Configuration flinkConfiguration, com.gotocompany.dagger.common.configuration.Configuration daggerConfiguration) { return new DaggerStatsDReporter(flinkConfiguration, daggerConfiguration); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/SerializedStatsDReporterSupplier.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/SerializedStatsDReporterSupplier.java index df19811c4..37bad2b50 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/SerializedStatsDReporterSupplier.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/SerializedStatsDReporterSupplier.java @@ -11,5 +11,13 @@ @FunctionalInterface public interface SerializedStatsDReporterSupplier extends Serializable { + /** + * Supplies the {@link StatsDReporter} through which metrics are published. + * + * <p>Invoked lazily on the task managers so that the non-serializable reporter is constructed + * after this serializable supplier has been shipped as part of the Flink job graph. + * + * @return the {@code StatsDReporter}; implementations may return a shared, cached instance + */ StatsDReporter buildStatsDReporter(); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/StatsDErrorReporter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/StatsDErrorReporter.java index 8b6cb67cd..a125c11c2 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/StatsDErrorReporter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/StatsDErrorReporter.java @@ -11,27 +11,80 @@ import static com.gotocompany.dagger.core.utils.Constants.FATAL_EXCEPTION_METRIC_GROUP_KEY; import static com.gotocompany.dagger.core.utils.Constants.NONFATAL_EXCEPTION_METRIC_GROUP_KEY; +/** + * {@link ErrorReporter} implementation that publishes job exceptions to StatsD as counters. + * + *

This reporter is the StatsD-backed error channel for a Dagger Flink job. Whenever the pipeline + * encounters a fatal or non-fatal exception, the corresponding {@code report*} method emits a count + * of {@code 1} on a dedicated metric, tagged with the fully-qualified class name of the offending + * exception so failures can be grouped by type in the metrics backend. Reporting is delegated to a + * depot {@link StatsDReporter} that the constructor obtains from a {@link SerializedStatsDReporterSupplier}; + * the class is declared {@link Serializable} so it can be shipped as part of the Flink job graph. + */ public class StatsDErrorReporter implements ErrorReporter, Serializable { + /** Tag key under which a fatal exception's class name is published. */ private static final String FATAL_EXCEPTION_TAG_KEY = "fatal_exception_type"; + /** Tag key under which a non-fatal exception's class name is published. */ private static final String NON_FATAL_EXCEPTION_TAG_KEY = "non_fatal_exception_type"; + /** Shared depot reporter used to emit the StatsD counts; obtained from the supplier. */ private final StatsDReporter statsDReporter; + /** + * Creates a reporter backed by the StatsD reporter produced by the given supplier. + * + * @param statsDReporterSupplier serializable supplier whose {@code buildStatsDReporter()} is + * invoked once to obtain the underlying depot reporter + */ public StatsDErrorReporter(SerializedStatsDReporterSupplier statsDReporterSupplier) { this.statsDReporter = statsDReporterSupplier.buildStatsDReporter(); } + /** + * Records a fatal exception by emitting a StatsD count of {@code 1} on the fatal-exception + * metric. + * + *

The exception's fully-qualified class name is attached as the value of the + * {@code fatal_exception_type} tag, allowing fatal failures to be aggregated by type in the + * metrics backend. + * + * @param exception the fatal exception whose type is reported + */ @Override public void reportFatalException(Exception exception) { StatsDTag statsDTag = new StatsDTag(FATAL_EXCEPTION_TAG_KEY, exception.getClass().getName()); statsDReporter.captureCount(FATAL_EXCEPTION_METRIC_GROUP_KEY, 1L, statsDTag.getFormattedTag()); } + /** + * Records a non-fatal exception by emitting a StatsD count of {@code 1} on the + * non-fatal-exception metric. + * + *

The exception's fully-qualified class name is attached as the value of the + * {@code non_fatal_exception_type} tag, allowing recoverable failures to be aggregated by type + * in the metrics backend. + * + * @param exception the non-fatal exception whose type is reported + */ @Override public void reportNonFatalException(Exception exception) { StatsDTag statsDTag = new StatsDTag(NON_FATAL_EXCEPTION_TAG_KEY, exception.getClass().getName()); statsDReporter.captureCount(NONFATAL_EXCEPTION_METRIC_GROUP_KEY, 1L, statsDTag.getFormattedTag()); } + /** + * Unsupported for the StatsD reporter. + * + *

Unlike the default {@link ErrorReporter} behavior, this implementation does not register + * exceptions against a Flink {@link MetricGroup}; all reporting happens through the depot + * {@link StatsDReporter} instead. Callers should use {@link #reportFatalException(Exception)} or + * {@link #reportNonFatalException(Exception)}. + * + * @param exception the exception that would be counted; ignored + * @param metricGroup the Flink metric group; ignored + * @param metricGroupKey the metric group key; ignored + * @return never returns normally + * @throws UnsupportedOperationException always, because this operation is not supported here + */ @Override public Counter addExceptionToCounter(Exception exception, MetricGroup metricGroup, String metricGroupKey) { throw new UnsupportedOperationException("This operation is not supported on StatsDErrorReporter"); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerCounterManager.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerCounterManager.java index 2cfa8ad1b..d2ca11a7a 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerCounterManager.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerCounterManager.java @@ -8,14 +8,37 @@ import java.util.ArrayList; +/** + * StatsD-backed {@link Counter} that emits count deltas for an aspect under a fixed set of tags. + * + *

After the tags have been supplied via {@link #register(StatsDTag[])}, the increment/decrement + * methods publish positive or negative deltas for an aspect's metric through the shared depot + * {@link StatsDReporter}. It is used throughout Dagger to count events such as processed records, + * errors, and retries. + */ public class DaggerCounterManager implements MeasurementManager, Counter { + /** Shared depot reporter used to publish count deltas. */ private final StatsDReporter statsDReporter; + /** Tags, in StatsD {@code key=value} form, attached to every delta; populated by {@link #register(StatsDTag[])}. */ private String[] formattedTags; + /** + * Creates a counter manager backed by the reporter produced by the given supplier. + * + * @param statsDReporterSupplier serializable supplier of the shared depot StatsD reporter + */ public DaggerCounterManager(SerializedStatsDReporterSupplier statsDReporterSupplier) { this.statsDReporter = statsDReporterSupplier.buildStatsDReporter(); } + /** + * {@inheritDoc} + * + *

Converts each {@link StatsDTag} into its formatted {@code key=value} representation and + * caches the resulting array for use on every subsequent delta. + * + * @param tags the tags to attach to all counter updates + */ @Override public void register(StatsDTag[] tags) { ArrayList tagList = new ArrayList<>(); @@ -25,21 +48,46 @@ public void register(StatsDTag[] tags) { this.formattedTags = tagList.toArray(new String[0]); } + /** + * Increments the aspect's counter by one. + * + * @param aspect the metric aspect to increment + */ @Override public void increment(Aspects aspect) { increment(aspect, 1L); } + /** + * Increments the aspect's counter by the given amount. + * + * @param aspect the metric aspect whose {@code getValue()} supplies the StatsD metric name + * @param positiveCount the (positive) delta to add to the counter + */ @Override public void increment(Aspects aspect, long positiveCount) { statsDReporter.captureCount(aspect.getValue(), positiveCount, formattedTags); } + /** + * Decrements the aspect's counter by one. + * + * @param aspect the metric aspect to decrement + */ @Override public void decrement(Aspects aspect) { decrement(aspect, -1L); } + /** + * Records the given delta against the aspect's counter. + * + *

The value is forwarded to the reporter unchanged, so callers must pass a negative number to + * actually decrease the count. + * + * @param aspect the metric aspect whose {@code getValue()} supplies the StatsD metric name + * @param negativeCount the delta to apply; expected to be negative for a true decrement + */ @Override public void decrement(Aspects aspect, long negativeCount) { statsDReporter.captureCount(aspect.getValue(), negativeCount, formattedTags); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerGaugeManager.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerGaugeManager.java index 357b4f2a8..9f0338662 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerGaugeManager.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerGaugeManager.java @@ -8,14 +8,38 @@ import java.util.ArrayList; +/** + * StatsD-backed {@link Gauge} that reports point-in-time integer values decorated with a fixed set + * of tags. + * + *

Once the tags have been supplied via {@link #register(StatsDTag[])}, each call to + * {@link #markValue(Aspects, int)} publishes the value under the aspect's metric name through the + * shared depot {@link StatsDReporter}. It is used across Dagger to surface current-state signals + * such as watermarks, consumer lag, or open-connection counts. + */ public class DaggerGaugeManager implements MeasurementManager, Gauge { + /** Shared depot reporter used to publish gauge values. */ private final StatsDReporter statsDReporter; + /** Tags, in StatsD {@code key=value} form, attached to every gauge; populated by {@link #register(StatsDTag[])}. */ private String[] formattedTags; + /** + * Creates a gauge manager backed by the reporter produced by the given supplier. + * + * @param statsDReporterSupplier serializable supplier of the shared depot StatsD reporter + */ public DaggerGaugeManager(SerializedStatsDReporterSupplier statsDReporterSupplier) { this.statsDReporter = statsDReporterSupplier.buildStatsDReporter(); } + /** + * {@inheritDoc} + * + *
<p>
Converts each {@link StatsDTag} into its formatted {@code key=value} representation and + * caches the resulting array for use on every subsequent gauge emission. + * + * @param tags the tags to attach to all gauge values + */ @Override public void register(StatsDTag[] tags) { ArrayList tagList = new ArrayList<>(); @@ -25,6 +49,12 @@ public void register(StatsDTag[] tags) { this.formattedTags = tagList.toArray(new String[0]); } + /** + * Publishes a gauge reading for the given aspect together with the registered tags. + * + * @param aspect the metric aspect whose {@code getValue()} supplies the StatsD metric name + * @param gaugeValue the current value to report + */ @Override public void markValue(Aspects aspect, int gaugeValue) { statsDReporter.gauge(aspect.getValue(), gaugeValue, formattedTags); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerHistogramManager.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerHistogramManager.java index 1c07dceae..854876b4e 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerHistogramManager.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/DaggerHistogramManager.java @@ -8,14 +8,38 @@ import java.util.ArrayList; +/** + * StatsD-backed {@link Histogram} that records distributions of {@code long} samples under a fixed + * set of tags. + * + *
<p>
Once the tags have been supplied via {@link #register(StatsDTag[])}, each call to + * {@link #recordValue(Aspects, long)} forwards the sample to the shared depot {@link StatsDReporter} + * through its histogram API. It is commonly used for latency and size distributions within the + * Dagger pipeline. + */ public class DaggerHistogramManager implements MeasurementManager, Histogram { + /** Shared depot reporter used to publish histogram samples. */ private final StatsDReporter statsDReporter; + /** Tags, in StatsD {@code key=value} form, attached to every sample; populated by {@link #register(StatsDTag[])}. */ private String[] formattedTags; + /** + * Creates a histogram manager backed by the reporter produced by the given supplier. + * + * @param statsDReporterSupplier serializable supplier of the shared depot StatsD reporter + */ public DaggerHistogramManager(SerializedStatsDReporterSupplier statsDReporterSupplier) { this.statsDReporter = statsDReporterSupplier.buildStatsDReporter(); } + /** + * {@inheritDoc} + * + *
<p>
Converts each {@link StatsDTag} into its formatted {@code key=value} representation and + * caches the resulting array for use on every subsequent sample. + * + * @param tags the tags to attach to all recorded samples + */ @Override public void register(StatsDTag[] tags) { ArrayList tagList = new ArrayList<>(); @@ -25,6 +49,12 @@ public void register(StatsDTag[] tags) { this.formattedTags = tagList.toArray(new String[0]); } + /** + * Records a single histogram sample for the given aspect together with the registered tags. + * + * @param aspect the metric aspect whose {@code getValue()} supplies the StatsD metric name + * @param value the sample value to add to the distribution + */ @Override public void recordValue(Aspects aspect, long value) { statsDReporter.captureHistogram(aspect.getValue(), value, formattedTags); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/MeasurementManager.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/MeasurementManager.java index 609261927..250adb242 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/MeasurementManager.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/manager/MeasurementManager.java @@ -4,6 +4,23 @@ import java.io.Serializable; +/** + * Common contract for StatsD measurement managers that publish a particular kind of metric. + * + *
<p>
Implementations such as {@link DaggerCounterManager}, {@link DaggerGaugeManager}, and + * {@link DaggerHistogramManager} wrap the shared depot StatsD reporter together with a fixed set of + * tags. The tags are supplied once via {@link #register(StatsDTag[])}; afterwards every metric the + * manager emits is decorated with them. Extending {@link Serializable} allows managers to be + * embedded in Flink operators and shipped with the job graph. + */ public interface MeasurementManager extends Serializable { + /** + * Registers the tags that will be attached to every metric this manager subsequently emits. + * + *
<p>
Typically called once during operator setup with the component- and instance-specific tags; + * implementations convert them into StatsD {@code key=value} strings and retain them for reuse. + * + * @param tags the tags to attach to all measurements; must not be {@code null} + */ void register(StatsDTag[] tags); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Counter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Counter.java index f85695be1..218dd55a8 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Counter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Counter.java @@ -4,12 +4,44 @@ import java.io.Serializable; +/** + * Abstraction for emitting counter metrics for a Dagger {@link Aspects} aspect. + * + *
<p>
Counters accumulate deltas over time and are typically used to count events such as records + * processed, errors, or retries. Implementations may increase or decrease the count by one or by an + * explicit amount. The interface extends {@link Serializable} so it can travel with the Flink job + * graph. + * + * @see com.gotocompany.dagger.core.metrics.reporters.statsd.manager.DaggerCounterManager + */ public interface Counter extends Serializable { + /** + * Increases the aspect's counter by one. + * + * @param aspect the metric aspect to increment + */ void increment(Aspects aspect); + /** + * Increases the aspect's counter by the given amount. + * + * @param aspect the metric aspect to increment + * @param num the delta to add to the counter + */ void increment(Aspects aspect, long num); + /** + * Decreases the aspect's counter by one. + * + * @param aspect the metric aspect to decrement + */ void decrement(Aspects aspect); + /** + * Applies the given delta to the aspect's counter in order to decrease it. The StatsD-backed + * implementation forwards {@code num} to the reporter unchanged, so callers must pass a negative + * value for the count to actually go down; a positive value raises it. + * + * @param aspect the metric aspect to decrement + * @param num the delta to apply to the counter; expected to be negative for a true decrement + */ void decrement(Aspects aspect, long num); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Gauge.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Gauge.java index b9078d20b..07cd5b3dc 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Gauge.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Gauge.java @@ -4,6 +4,21 @@ import java.io.Serializable; +/** + * Abstraction for emitting gauge metrics (point-in-time values) for a Dagger {@link Aspects} aspect. + * + *
<p>
A gauge reports the latest value of some quantity rather than a cumulative total, for example a + * current lag, queue depth, or watermark. The interface extends {@link Serializable} so it can be + * embedded in Flink operators shipped with the job graph. + * + * @see com.gotocompany.dagger.core.metrics.reporters.statsd.manager.DaggerGaugeManager + */ public interface Gauge extends Serializable { + /** + * Reports the current value of the given aspect's gauge. + * + * @param aspect the metric aspect identifying which gauge to update + * @param gaugeValue the current value to report + */ void markValue(Aspects aspect, int gaugeValue); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Histogram.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Histogram.java index d4d3d101a..ede64b470 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Histogram.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/measurement/Histogram.java @@ -4,6 +4,21 @@ import java.io.Serializable; +/** + * Abstraction for emitting histogram (distribution) metrics for a Dagger {@link Aspects} aspect. + * + *
<p>
Implementations record individual samples that the metrics backend aggregates into a + * distribution (percentiles, min/max, and so on). The interface extends {@link Serializable} so it + * can be carried inside Flink operators as part of the job graph. + * + * @see com.gotocompany.dagger.core.metrics.reporters.statsd.manager.DaggerHistogramManager + */ public interface Histogram extends Serializable { + /** + * Records a single sample for the given aspect. + * + * @param aspect the metric aspect identifying which histogram to update + * @param value the sample value to add to the distribution + */ void recordValue(Aspects aspect, long value); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/ComponentTags.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/ComponentTags.java index 7c47a42e5..9ac0ab047 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/ComponentTags.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/ComponentTags.java @@ -1,10 +1,29 @@ package com.gotocompany.dagger.core.metrics.reporters.statsd.tags; +/** + * Provides shared, lazily-initialized {@link StatsDTag} arrays that identify individual Dagger + * pipeline components. + * + *
<p>
These component tags (all keyed by {@code component}) are attached to metrics emitted by the + * corresponding subsystems so dashboards can attribute measurements to a specific component, such as + * the Parquet file reader or the source split assigner. Each array is cached after first use and + * reused on subsequent calls. + */ public class ComponentTags { + /** Cached tags marking the Parquet reader component; built on first access. */ private static StatsDTag[] parquetReaderTags; + /** Cached tags marking the split-assigner component; built on first access. */ private static StatsDTag[] splitAssignerTags; + /** Tag key shared by all component tags. */ private static final String COMPONENT_TAG_KEY = "component"; + /** + * Returns the tags identifying the Parquet reader component. + * + *
<p>
The array is created on the first call and cached for reuse. + * + * @return a single-element array tagging metrics with {@code component=parquet_reader} + */ public static StatsDTag[] getParquetReaderTags() { if (parquetReaderTags == null) { parquetReaderTags = new StatsDTag[]{ @@ -14,6 +33,13 @@ public static StatsDTag[] getParquetReaderTags() { return parquetReaderTags; } + /** + * Returns the tags identifying the source split-assigner component. + * + *
<p>
The array is created on the first call and cached for reuse. + * + * @return a single-element array tagging metrics with {@code component=split_assigner} + */ public static StatsDTag[] getSplitAssignerTags() { if (splitAssignerTags == null) { splitAssignerTags = new StatsDTag[]{ diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/GlobalTags.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/GlobalTags.java index 40b93f210..91f7a2f07 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/GlobalTags.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/GlobalTags.java @@ -1,5 +1,13 @@ package com.gotocompany.dagger.core.metrics.reporters.statsd.tags; +/** + * Holder for global tag keys that are applied to every metric emitted by a Dagger job. + * + *
<p>
Unlike component-specific tags, these dimensions are attached to all measurements regardless of + * which subsystem produces them. They are registered as extra tags when the shared StatsD reporter + * is built in {@code DaggerStatsDReporter}. + */ public class GlobalTags { + /** Tag key carrying the Flink job id, used to distinguish metrics across jobs. */ public static final String JOB_ID = "job_id"; } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/StatsDTag.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/StatsDTag.java index b0394a353..2846a2b1a 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/StatsDTag.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/reporters/statsd/tags/StatsDTag.java @@ -2,21 +2,54 @@ import org.apache.flink.util.Preconditions; +/** + * Immutable representation of a single StatsD tag, either a bare {@code key} or a {@code key=value} + * pair. + * + *
<p>
Dagger attaches these tags to every metric it publishes so measurements can be sliced by + * dimensions such as component, job id, or exception type. The tag key is mandatory; a missing or + * empty value is normalized to an internal sentinel so the tag renders as a bare key. Call + * {@link #getFormattedTag()} to obtain the StatsD wire representation. + */ public class StatsDTag { + /** The tag key; guaranteed non-null and non-empty. */ private final String tagKey; + /** The tag value, or {@link #NIL_TAG_VALUE} when no meaningful value was supplied. */ private final String tagValue; + /** Sentinel marking the absence of a value, which causes the tag to format as a bare key. */ private static final String NIL_TAG_VALUE = "NIL_TAG_VALUE"; + /** + * Creates a tag with the given key and value. + * + * @param key the tag key; must be non-null and non-empty + * @param value the tag value; a {@code null} or empty value is treated as "no value", causing + * {@link #getFormattedTag()} to emit just the key + * @throws IllegalArgumentException if {@code key} is {@code null} or empty + */ public StatsDTag(String key, String value) { Preconditions.checkArgument(key != null && !key.isEmpty(), "Tag key cannot be null or empty"); this.tagKey = key; this.tagValue = (value != null && !value.isEmpty()) ? value : NIL_TAG_VALUE; } + /** + * Creates a value-less tag consisting of only a key. + * + *
<p>
The resulting tag is rendered as the bare key by {@link #getFormattedTag()}. + * + * @param tagName the tag key; must be non-null and non-empty + * @throws IllegalArgumentException if {@code tagName} is {@code null} or empty + */ public StatsDTag(String tagName) { this(tagName, NIL_TAG_VALUE); } + /** + * Renders this tag in StatsD wire format. + * + * @return {@code key=value} when a value is present, or just {@code key} for a value-less tag + */ public String getFormattedTag() { if (tagValue.equals(NIL_TAG_VALUE)) { return tagKey; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/telemetry/TelemetryPublisher.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/telemetry/TelemetryPublisher.java index 512902f4e..c38bfff03 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/telemetry/TelemetryPublisher.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/telemetry/TelemetryPublisher.java @@ -8,6 +8,12 @@ * The interface Telemetry publisher. */ public interface TelemetryPublisher { + /** + * Shared registry of {@link TelemetrySubscriber} instances notified when telemetry changes. + * + *
<p>
Being declared in an interface, this field is implicitly {@code public static final}: the + * reference cannot be reassigned, but the {@code ArrayList} it points to is mutable and is shared + * by every publisher implementation. + */ List TELEMETRY_SUBSCRIBERS = new ArrayList<>(); /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/telemetry/TelemetryTypes.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/telemetry/TelemetryTypes.java index 6e68c7394..32c716a9f 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/telemetry/TelemetryTypes.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/metrics/telemetry/TelemetryTypes.java @@ -4,15 +4,45 @@ * The enum Telemetry types. */ public enum TelemetryTypes { + /** + * Telemetry tag identifying the Kafka input topic. + */ INPUT_TOPIC("input_topic"), + /** + * Telemetry tag identifying the input Protobuf message class. + */ INPUT_PROTO("input_proto"), + /** + * Telemetry tag identifying the input stream name. + */ INPUT_STREAM("input_stream"), + /** + * Telemetry tag identifying the configured sink type. + */ SINK_TYPE("sink_type"), + /** + * Telemetry tag identifying the Kafka output topic. + */ OUTPUT_TOPIC("output_topic"), + /** + * Telemetry tag identifying the output Protobuf message class. + */ OUTPUT_PROTO("output_proto"), + /** + * Telemetry tag identifying the output stream name. + */ OUTPUT_STREAM("output_stream"), + /** + * Telemetry tag identifying the post-processor type in use. + */ POST_PROCESSOR_TYPE("post_processor_type"), + /** + * Telemetry tag identifying the pre-processor type in use. + */ PRE_PROCESSOR_TYPE("pre_processor_type"), + /** + * Telemetry tag identifying the source metric id. + */ SOURCE_METRIC_ID("source_metricId"); /** @@ -24,8 +54,16 @@ public String getValue() { return value; } + /** + * The string tag reported for this telemetry type. + */ private String value; + /** + * Instantiates a new telemetry type. 
+ * + * @param value the string tag for this telemetry type + */ TelemetryTypes(String value) { this.value = value; } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/ColumnNameManager.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/ColumnNameManager.java index d313a6389..30e33711c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/ColumnNameManager.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/ColumnNameManager.java @@ -10,7 +10,13 @@ * Manage input and output column names. */ public class ColumnNameManager implements Serializable { + /** + * The fixed, ordered list of input column names provided at construction time. + */ private final List inputColumnNames; + /** + * The ordered list of output column names, with the select-all sentinel expanded to the inputs. + */ private List outputColumnNames; /** @@ -71,6 +77,16 @@ public String[] getInputColumnNames() { return inputColumnNames.toArray(new String[0]); } + /** + * Resolves the effective output column names for this manager. + * + *
<p>
When the configured names request selecting all input columns (via the + * {@code Constants.SQL_PATH_SELECT_ALL_CONFIG_VALUE} sentinel value), that sentinel is removed and + * replaced by the full set of input column names so downstream consumers always observe concrete + * column names. The sentinel is removed from {@code names} itself, so the caller's list is + * modified in place. + * + * @param names the configured output column names; may contain the select-all sentinel value, in + * which case the list is mutated + * @return the resolved list of output column names + */ private List setOutputColumnNames(List names) { if (selectAllFromInputColumns(names)) { names.remove(Constants.SQL_PATH_SELECT_ALL_CONFIG_VALUE); @@ -79,6 +95,13 @@ private List setOutputColumnNames(List names) { return names; } + /** + * Checks whether the configured output columns request selecting all input columns. + * + * @param names the configured output column names; may be {@code null} + * @return {@code true} if {@code names} is non-null and contains the select-all sentinel value, + * {@code false} otherwise + */ private boolean selectAllFromInputColumns(List names) { return names != null && names.contains(Constants.SQL_PATH_SELECT_ALL_CONFIG_VALUE); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/ParentPostProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/ParentPostProcessor.java index 3b834063a..332dd419d 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/ParentPostProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/ParentPostProcessor.java @@ -26,11 +26,23 @@ * The Parent post processor. */ public class ParentPostProcessor implements PostProcessor { + /** + * The parsed post processor configuration describing the external, internal and transform stages. + */ private final PostProcessorConfig postProcessorConfig; + /** + * The orchestrator used to obtain Stencil clients for resolving Protobuf descriptors. 
+ */ private final StencilClientOrchestrator stencilClientOrchestrator; + /** + * The subscriber notified about telemetry emitted by the underlying post processors. + */ private TelemetrySubscriber telemetrySubscriber; + /** + * The Dagger context exposing the job {@link Configuration} and shared runtime wiring. + */ private final DaggerContext daggerContext; /** @@ -47,11 +59,33 @@ public ParentPostProcessor(DaggerContext daggerContext, StencilClientOrchestrato this.postProcessorConfig = parsePostProcessorConfig(daggerContext.getConfiguration()); } + /** + * Parses the post processor configuration from the job configuration. + * + *
<p>
Reads the raw JSON string stored under + * {@code Constants.PROCESSOR_POSTPROCESSOR_CONFIG_KEY} (defaulting to an empty string when + * absent) and converts it into a {@link PostProcessorConfig}. + * + * @param configuration the job configuration to read the raw config string from + * @return the parsed post processor configuration + */ private static PostProcessorConfig parsePostProcessorConfig(Configuration configuration) { String postProcessorConfigString = configuration.getString(Constants.PROCESSOR_POSTPROCESSOR_CONFIG_KEY, ""); return PostProcessorConfig.parse(postProcessorConfigString); } + /** + * {@inheritDoc} + * + *
<p>
Applies the configured post processing pipeline to the given stream. When no post + * processing is configured the input is returned unchanged. Otherwise the stream is first + * initialized with an output {@link Row}, passed through the enabled external and internal + * post processors, projected onto the configured output columns, and finally run through any + * configured SQL transformers. + * + * @param streamInfo the incoming stream together with its column names + * @return the resulting stream after all applicable post processing stages have run + */ @Override public StreamInfo process(StreamInfo streamInfo) { if (!canProcess(postProcessorConfig)) { @@ -81,11 +115,31 @@ public StreamInfo process(StreamInfo streamInfo) { return resultantStreamInfo; } + /** + * {@inheritDoc} + * + *
<p>
Determines whether this parent processor has any post processing work to perform. + * + * @param config the post processor configuration to evaluate + * @return {@code true} when {@code config} is non-null and not empty, {@code false} otherwise + */ @Override public boolean canProcess(PostProcessorConfig config) { return config != null && !config.isEmpty(); } + /** + * Builds the list of post processors that are enabled and applicable for the current config. + * + *
<p>
Returns an empty list when post processing is disabled via + * {@code Constants.PROCESSOR_POSTPROCESSOR_ENABLE_KEY}. Otherwise it assembles the external and + * internal post processors and keeps only those whose {@code canProcess} check passes for the + * parsed configuration. + * + * @param subscriber the telemetry subscriber forwarded to the external metric configuration + * @param schemaConfig the schema configuration shared across the post processors + * @return the post processors that are enabled and able to process the current configuration + */ private List getEnabledPostProcessors(TelemetrySubscriber subscriber, SchemaConfig schemaConfig) { if (!daggerContext.getConfiguration().getBoolean(Constants.PROCESSOR_POSTPROCESSOR_ENABLE_KEY, Constants.PROCESSOR_POSTPROCESSOR_ENABLE_DEFAULT)) { return new ArrayList<>(); @@ -101,6 +155,13 @@ private List getEnabledPostProcessors(TelemetrySubscriber subscri .collect(Collectors.toList()); } + /** + * Creates the external metric configuration used to instrument the external post processors. 
+ * + * @param config the job configuration carrying external metric settings + * @param subscriber the telemetry subscriber that receives external telemetry + * @return a new {@link ExternalMetricConfig} bound to the given config and subscriber + */ private ExternalMetricConfig getExternalMetricConfig(Configuration config, TelemetrySubscriber subscriber) { return new ExternalMetricConfig(config, subscriber); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PostProcessorConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PostProcessorConfig.java index c1328b35f..7e358fa70 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PostProcessorConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PostProcessorConfig.java @@ -21,9 +21,21 @@ */ public class PostProcessorConfig implements Serializable { + /** + * Configuration for external post processor sources such as HTTP, GRPC, Elasticsearch and Postgres. + */ private ExternalSourceConfig externalSource; + /** + * Configuration for the transformers applied to the stream during post processing. + */ private List transformers; + /** + * Configuration for internal post processor sources that derive output fields from existing data. + */ private List internalSource; + /** + * Shared Gson instance configured to map snake_case JSON keys onto the config fields. 
+ */ private static final Gson GSON = new GsonBuilder().setFieldNamingPolicy(FieldNamingPolicy.LOWER_CASE_WITH_UNDERSCORES).create(); /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PostProcessorFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PostProcessorFactory.java index 150189fc5..17e766e20 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PostProcessorFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PostProcessorFactory.java @@ -44,6 +44,18 @@ public static List getPostProcessors(DaggerContext daggerContext, return postProcessors; } + /** + * Builds the Longbow post processor used for feature and range lookups backed by BigTable. + * + *
<p>
Creates a {@link LongbowSchema} from the supplied column names and delegates to a + * {@link LongbowFactory} to construct the appropriate Longbow processor implementation. + * + * @param columnNames the output column names used to derive the Longbow schema + * @param configuration the job configuration controlling Longbow behavior + * @param metricsTelemetryExporter the exporter that publishes Longbow telemetry + * @param stencilClientOrchestrator the orchestrator used to resolve Protobuf descriptors + * @return the Longbow post processor instance produced by the factory + */ private static PostProcessor getLongBowProcessor(String[] columnNames, Configuration configuration, MetricsTelemetryExporter metricsTelemetryExporter, StencilClientOrchestrator stencilClientOrchestrator) { final LongbowSchema longbowSchema = new LongbowSchema(columnNames); LongbowFactory longbowFactory = new LongbowFactory(longbowSchema, configuration, stencilClientOrchestrator, metricsTelemetryExporter); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PreProcessorOrchestrator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PreProcessorOrchestrator.java index 56b12b49f..a76a91b50 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PreProcessorOrchestrator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/PreProcessorOrchestrator.java @@ -23,10 +23,25 @@ */ public class PreProcessorOrchestrator implements Preprocessor { + /** + * The exporter that publishes preprocessor telemetry to the metrics subscriber. + */ private final MetricsTelemetryExporter metricsTelemetryExporter; + /** + * The parsed preprocessor configuration, or {@code null} when preprocessing is disabled. + */ private final PreProcessorConfig processorConfig; + /** + * The name of the table whose transformers this orchestrator applies. 
+ */ private final String tableName; + /** + * The Dagger context exposing the job {@link Configuration} and shared runtime wiring. + */ private final DaggerContext daggerContext; + /** + * Shared Gson instance configured to map snake_case JSON keys onto the config fields. + */ private static final Gson GSON = new GsonBuilder().setFieldNamingPolicy(FieldNamingPolicy.LOWER_CASE_WITH_UNDERSCORES).create(); /** @@ -63,6 +78,15 @@ public PreProcessorConfig parseConfig(Configuration configuration) { return config; } + /** + * {@inheritDoc} + * + *
<p>
Runs each enabled preprocessor over the stream in turn and then wraps the result in a + * {@link ValidRecordsDecorator} so that only valid records are forwarded downstream. + * + * @param streamInfo the incoming stream together with its column names + * @return the resulting stream after all preprocessors and the valid-records filter have run + */ @Override public StreamInfo process(StreamInfo streamInfo) { for (Preprocessor processor : getProcessors()) { @@ -99,6 +123,14 @@ protected List getProcessors() { return preprocessors; } + /** + * {@inheritDoc} + * + *
<p>
Determines whether this orchestrator has any preprocessing work to perform. + * + * @param config the preprocessor configuration to evaluate + * @return {@code true} when {@code config} is non-null and not empty, {@code false} otherwise + */ @Override public boolean canProcess(PreProcessorConfig config) { return config != null && !config.isEmpty(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/DescriptorManager.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/DescriptorManager.java index 3325bdc07..6ed4b50b5 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/DescriptorManager.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/DescriptorManager.java @@ -12,6 +12,9 @@ * The Descriptor manager. */ public class DescriptorManager implements Serializable { + /** + * The Stencil client used to resolve Protobuf descriptors by proto class name. + */ private StencilClient stencilClient; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/EndpointHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/EndpointHandler.java index 68b308576..035a100fb 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/EndpointHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/EndpointHandler.java @@ -28,13 +28,37 @@ * The Endpoint handler. */ public class EndpointHandler { + /** + * Logger used to record diagnostic information during endpoint variable resolution. + */ private static final Logger LOGGER = LoggerFactory.getLogger(EndpointHandler.class.getName()); + /** + * Meter stats manager used to record external source metrics such as empty inputs. + */ private MeterStatsManager meterStatsManager; + /** + * Reporter used to surface fatal errors encountered while resolving descriptors. 
+ */ private ErrorReporter errorReporter; + /** + * The fully qualified names of the input Protobuf classes used to locate field descriptors. + */ private String[] inputProtoClasses; + /** + * Lazily built mapping from input column name to its Protobuf {@link Descriptors.FieldDescriptor}. + */ private Map descriptorMap; + /** + * Manager that resolves input and output column indices by name. + */ private ColumnNameManager columnNameManager; + /** + * Manager that resolves Protobuf descriptors by proto class name. + */ private DescriptorManager descriptorManager; + /** + * The most recently resolved input Protobuf message descriptor. + */ private Descriptors.Descriptor descriptor; /** @@ -117,6 +141,18 @@ public boolean isQueryInvalid(ResultFuture resultFuture, RowManager rowMana return false; } + /** + * Builds a lookup from required input column names to their Protobuf field descriptors. + * + *
<p>
For each required column the method scans every configured input proto class and records + * the first matching {@link Descriptors.FieldDescriptor} found. Columns without a matching field + * are simply omitted from the returned map. + * + * @param requiredInputColumns the input column names that need descriptors + * @param inputProtoClassNames the proto class names to search for matching fields + * @param resultFuture the result future completed exceptionally if a descriptor is missing + * @return a map from column name to its matching field descriptor + */ private Map createDescriptorMap(String[] requiredInputColumns, String[] inputProtoClassNames, ResultFuture resultFuture) { @@ -134,6 +170,16 @@ private Map createDescriptorMap(String[] re return descriptorHashMap; } + /** + * Resolves the Protobuf message descriptor for the given proto class name. + * + *
<p>
If the descriptor cannot be found the underlying error is reported and propagated through + * the supplied result future. + * + * @param resultFuture the result future completed exceptionally when the descriptor is missing + * @param protoClassName the fully qualified proto class name to resolve + * @return the resolved message descriptor, or the previously held descriptor if resolution failed + */ private Descriptors.Descriptor getInputDescriptor(ResultFuture resultFuture, String protoClassName) { try { descriptor = descriptorManager.getDescriptor(protoClassName); @@ -143,6 +189,12 @@ private Descriptors.Descriptor getInputDescriptor(ResultFuture resultFuture return descriptor; } + /** + * Reports the given exception as fatal and completes the result future exceptionally. + * + * @param resultFuture the result future to complete exceptionally + * @param exception the exception to report and propagate + */ private void reportAndThrowError(ResultFuture resultFuture, Exception exception) { errorReporter.reportFatalException(exception); resultFuture.completeExceptionally(exception); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/FetchOutputDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/FetchOutputDecorator.java index 281df539f..57801bd84 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/FetchOutputDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/FetchOutputDecorator.java @@ -22,9 +22,21 @@ public class FetchOutputDecorator implements MapDecorator { + /** + * The names of the output columns produced by the post processing pipeline. + */ private String[] outputColumnNames; + /** + * The orchestrator used to obtain the Stencil client for resolving the output descriptor. 
+ */ private StencilClientOrchestrator stencilClientOrchestrator; + /** + * The fully qualified name of the output Protobuf class used to derive output column types. + */ private String outputProtoClassName; + /** + * Whether a SQL transformer follows this stage, requiring typed output and timestamp conversion. + */ private boolean hasSQLTransformer; /** @@ -40,11 +52,29 @@ public FetchOutputDecorator(SchemaConfig schemaConfig, boolean hasSQLTransformer this.hasSQLTransformer = hasSQLTransformer; } + /** + * {@inheritDoc} + * + *
<p>
This decorator is applied explicitly via {@link #decorate(DataStream)} rather than through + * the generic decoration chain, so it never opts into automatic decoration. + * + * @return {@code false} always + */ @Override public Boolean canDecorate() { return false; } + /** + * {@inheritDoc} + * + *
<p>
Extracts the output {@link Row} from the wrapped input record. When a SQL transformer + * follows, any {@code ROWTIME} column is converted from {@link LocalDateTime} to a SQL + * {@link Timestamp} so the value matches the expected Flink type. + * + * @param input the combined input/output record produced upstream + * @return the extracted output row, with timestamp conversion applied when required + */ @Override public Row map(Row input) { RowManager rowManager = new RowManager(input); @@ -52,11 +82,29 @@ public Row map(Row input) { } + /** + * {@inheritDoc} + * + *
<p>
Maps the input stream through this decorator. When a SQL transformer follows, the mapped + * stream is annotated with explicit {@link TypeInformation} derived from the output descriptor. + * + * @param inputStream the stream of combined input/output records + * @return the stream of extracted output rows + */ @Override public DataStream decorate(DataStream inputStream) { return hasSQLTransformer ? inputStream.map(this).returns(getTypeInformation()) : inputStream.map(this); } + /** + * Derives the Flink row type information for the output columns. + * + *
<p>
Each output column is typed from its matching Protobuf field descriptor when available; the + * {@code ROWTIME} column maps to a SQL timestamp, and any unresolved column falls back to a + * generic {@link Object} type. + * + * @return the {@link RowTypeInfo} describing the output columns + */ private TypeInformation getTypeInformation() { TypeInformation[] typeInformations = new TypeInformation[outputColumnNames.length]; Arrays.fill(typeInformations, TypeInformation.of(Object.class)); @@ -73,10 +121,25 @@ private TypeInformation getTypeInformation() { return new RowTypeInfo(typeInformations, outputColumnNames); } + /** + * Resolves the Protobuf descriptor for the configured output proto class. + * + * @return the output message descriptor, or {@code null} when it cannot be resolved + */ private Descriptors.Descriptor getDescriptor() { return stencilClientOrchestrator.getStencilClient().get(outputProtoClassName); } + /** + * Copies the given row, converting the {@code ROWTIME} column to a SQL {@link Timestamp}. + * + *
<p>
All fields are copied verbatim except the {@code ROWTIME} column, whose + * {@link LocalDateTime} value (when present) is converted to a {@link Timestamp} for SQL + * compatibility. + * + * @param row the row whose fields should be copied and timestamp-converted + * @return a new row with the timestamp column converted + */ private Row convertLocalDateTime(Row row) { Row outputRow = new Row(row.getArity()); for (int index = 0; index < outputColumnNames.length; index++) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/InitializationDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/InitializationDecorator.java index 173dcf46c..d65c8ede8 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/InitializationDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/InitializationDecorator.java @@ -10,6 +10,9 @@ */ public class InitializationDecorator implements MapDecorator { + /** + * Manager that supplies the output column count used to size the combined output row. + */ private ColumnNameManager columnNameManager; /** @@ -21,11 +24,28 @@ public InitializationDecorator(ColumnNameManager columnNameManager) { this.columnNameManager = columnNameManager; } + /** + * {@inheritDoc} + * + *
<p>
This decorator is always applied explicitly to the stream and therefore does not opt into + * the generic decoration chain. + * + * @return {@code false} always + */ @Override public Boolean canDecorate() { return false; } + /** + * {@inheritDoc} + * + *
<p>
Wraps the input {@link Row} into a combined record sized to hold both the input and the + * configured number of output columns, ready for the downstream post processors to populate. + * + * @param input the original input row + * @return the initialized combined input/output row + */ @Override public Row map(Row input) { RowManager rowManager = new RowManager(input, columnNameManager.getOutputSize()); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/OutputMapping.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/OutputMapping.java index dd6a9d3aa..d67830e1e 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/OutputMapping.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/OutputMapping.java @@ -11,6 +11,9 @@ */ public class OutputMapping implements Serializable, Validator { + /** + * The JSON path expression that selects the value to extract from the external response. + */ private String path; /** @@ -31,12 +34,28 @@ public String getPath() { return path; } + /** + * Returns the fields that must be present for this mapping to be considered valid. + * + *
<p>
The output mapping requires a non-null {@code path}, returned keyed by its field name so + * the validator can ensure it is configured. + * + * @return a map of mandatory field names to their configured values + */ public HashMap getMandatoryFields() { HashMap mandatoryFields = new HashMap<>(); mandatoryFields.put("path", path); return mandatoryFields; } + /** + * {@inheritDoc} + * + *
<p>
Two output mappings are equal when they refer to the same {@code path}. + * + * @param o the object to compare with + * @return {@code true} if {@code o} is an output mapping with an equal path, {@code false} otherwise + */ @Override public boolean equals(Object o) { if (this == o) { @@ -49,6 +68,11 @@ public boolean equals(Object o) { return Objects.equals(path, that.path); } + /** + * {@inheritDoc} + * + * @return a hash code derived from the {@code path} + */ @Override public int hashCode() { return Objects.hash(path); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/PostResponseTelemetry.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/PostResponseTelemetry.java index 3fe04b010..a955a546e 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/PostResponseTelemetry.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/PostResponseTelemetry.java @@ -63,9 +63,21 @@ public void validateResponseCode(MeterStatsManager meterStatsManager, int status } } + /** + * Checks whether the given HTTP status code falls within the client-error (4xx) range. + * + * @param statusCode the HTTP status code returned by the external call + * @return {@code true} if the code is a client error, {@code false} otherwise + */ private boolean isClientError(int statusCode) { return statusCode >= CLIENT_ERROR_MIN_STATUS_CODE && statusCode <= CLIENT_ERROR_MAX_STATUS_CODE; } + /** + * Checks whether the given HTTP status code falls within the server-error (5xx) range. 
+ * + * @param statusCode the HTTP status code returned by the external call + * @return {@code true} if the code is a server error, {@code false} otherwise + */ private boolean isServerError(int statusCode) { return statusCode >= SERVER_ERROR_MIN_STATUS_CODE && statusCode <= SERVER_ERROR_MAX_STATUS_CODE; } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/RowManager.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/RowManager.java index 3a164b018..eef1c5e8b 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/RowManager.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/RowManager.java @@ -10,8 +10,17 @@ * A class that responsible for managing input and output Row. */ public class RowManager { + /** + * Index of the input child row within the parent input/output row. + */ public static final int INPUT_ROW_INDEX = 0; + /** + * Index of the output child row within the parent input/output row. + */ public static final int OUTPUT_ROW_INDEX = 1; + /** + * The parent row holding the input row and output row as its two children. + */ private Row parentRow; /** @@ -57,6 +66,13 @@ public Object getFromInput(int fieldIndex) { return getChildRow(INPUT_ROW_INDEX).getField(fieldIndex); } + /** + * Returns the input or output child row stored within the parent row. + * + * @param index the child index, either {@link #INPUT_ROW_INDEX} or {@link #OUTPUT_ROW_INDEX} + * @return the child row at the given index + * @throws InputOutputMappingException if the parent row does not have the expected arity of two + */ private Row getChildRow(int index) { if (parentRow.getArity() != 2) { throw new InputOutputMappingException("InputOutputRow does not contain output. Something went wrong. Row Arity: " + parentRow.getArity()); @@ -91,6 +107,15 @@ public Row getOutputData() { return getChildRow(OUTPUT_ROW_INDEX); } + /** + * {@inheritDoc} + * + *
<p>
Two row managers are equal when they wrap equal parent rows. + * + * @param o the object to compare with + * @return {@code true} if {@code o} is a row manager wrapping an equal parent row, + * {@code false} otherwise + */ @Override public boolean equals(Object o) { if (this == o) { @@ -103,6 +128,11 @@ public boolean equals(Object o) { return Objects.equals(parentRow, that.parentRow); } + /** + * {@inheritDoc} + * + * @return a hash code derived from the wrapped parent row + */ @Override public int hashCode() { return Objects.hash(parentRow); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/SchemaConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/SchemaConfig.java index eac3558de..6ef3de73c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/SchemaConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/SchemaConfig.java @@ -17,11 +17,29 @@ * The Schema config. */ public class SchemaConfig implements Serializable { + /** + * The job configuration providing access to stream and sink settings. + */ private final Configuration configuration; + /** + * The orchestrator used to obtain Stencil clients for resolving Protobuf descriptors. + */ private final StencilClientOrchestrator stencilClientOrchestrator; + /** + * Manager that resolves input and output column indices by name. + */ private ColumnNameManager columnNameManager; + /** + * The fully qualified names of the input Protobuf message classes for each configured stream. + */ private String[] inputProtoClasses; + /** + * The fully qualified name of the output Protobuf message class written to the sink. + */ private String outputProtoClassName; + /** + * Shared Gson instance used to parse the JSON input streams configuration. 
+ */ private static final Gson GSON = new Gson(); /** @@ -75,6 +93,14 @@ public String getOutputProtoClassName() { return outputProtoClassName; } + /** + * Extracts the input Protobuf message class names from the configured input streams. + * + *
<p>
Parses the {@code INPUT_STREAMS} JSON configuration and collects the + * {@code STREAM_INPUT_SCHEMA_PROTO_CLASS} entry from each stream definition. + * + * @return the array of input proto class names, one per configured stream + */ private String[] getMessageProtoClasses() { String jsonArrayString = configuration.getString(INPUT_STREAMS, ""); Map[] streamsConfig = GSON.fromJson(jsonArrayString, Map[].class); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/ValidRecordsDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/ValidRecordsDecorator.java index 7a86fbeab..d8fe914b2 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/ValidRecordsDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/common/ValidRecordsDecorator.java @@ -17,8 +17,17 @@ */ public class ValidRecordsDecorator extends RichFilterFunction implements FilterDecorator { + /** + * The name of the table whose records are being validated, used in error messages. + */ private final String tableName; + /** + * The index of the internal validation field within each row. + */ private final int validationIndex; + /** + * The job configuration used to construct the error reporter. + */ private final Configuration configuration; /** * The Error reporter. @@ -38,16 +47,41 @@ public ValidRecordsDecorator(String tableName, String[] columns, Configuration c this.configuration = configuration; } + /** + * {@inheritDoc} + * + *
<p>
Initializes the error reporter from the Flink runtime metric group when the function is + * opened on a task manager. + * + * @param internalFlinkConfig the Flink configuration supplied when the function is opened + * @throws Exception if initialization fails + */ @Override public void open(org.apache.flink.configuration.Configuration internalFlinkConfig) throws Exception { errorReporter = ErrorReporterFactory.getErrorReporter(getRuntimeContext().getMetricGroup(), this.configuration); } + /** + * {@inheritDoc} + * + * @return {@code true} always, since invalid records must always be filtered out + */ @Override public Boolean canDecorate() { return true; } + /** + * {@inheritDoc} + * + *
<p>
Keeps only valid records. When a record's internal validation flag is {@code false} the + * failure is reported and an exception is thrown to fail the job, preventing bad records from + * propagating downstream. + * + * @param value the record to validate + * @return {@code true} when the record is valid + * @throws Exception if the record is invalid, wrapping an {@code InvalidProtocolBufferException} + */ @Override public boolean filter(Row value) throws Exception { if (!(boolean) value.getField(validationIndex)) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/AsyncConnector.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/AsyncConnector.java index bbead41d8..a4363c0d3 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/AsyncConnector.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/AsyncConnector.java @@ -35,15 +35,46 @@ * The Async connector. */ public abstract class AsyncConnector extends RichAsyncFunction implements TelemetryPublisher { + /** + * The identifier of the external source type (for example {@code ES}, {@code HTTP}, {@code GRPC}, or {@code PG}), + * used when registering metrics and telemetry for this connector. + */ private final String sourceType; + /** + * The source-specific configuration describing the endpoint, request pattern, and behaviour of this connector. + */ private final SourceConfig sourceConfig; + /** + * The metric configuration controlling telemetry, the metric id, and the shutdown period for this connector. + */ private final ExternalMetricConfig externalMetricConfig; + /** + * The schema configuration providing column metadata, input/output proto classes, and the stencil orchestrator. + */ private final SchemaConfig schemaConfig; + /** + * Reporter used to surface fatal and non-fatal errors raised while making external calls. 
+ */ private ErrorReporter errorReporter; + /** + * Manager used to register and emit meter-style metrics for the external source aspects. + */ private MeterStatsManager meterStatsManager; + /** + * Resolver used to look up protobuf {@link Descriptors.Descriptor}s for request and response messages. + */ private DescriptorManager descriptorManager; + /** + * Telemetry collected for this connector, keyed by telemetry type, with each key mapping to a list of values. + */ private Map> metrics = new HashMap<>(); + /** + * The protobuf descriptor of the output message that enriched values are written into. + */ private Descriptors.Descriptor outputDescriptor; + /** + * Helper that resolves endpoint/request variable values from the incoming {@link Row}. + */ private EndpointHandler endpointHandler; /** @@ -143,6 +174,17 @@ protected DescriptorManager initDescriptorManager(SchemaConfig config) { return new DescriptorManager(config.getStencilClientOrchestrator()); } + /** + * {@inheritDoc} + * + *
<p>
Initializes the connector by lazily creating the {@link DescriptorManager}, the external client + * (via {@link #createClient()}), the {@link ErrorReporter}, the {@link MeterStatsManager}, and the + * {@link EndpointHandler}, and then registers the external source metrics under the configured source + * type and metric id. + * + * @param configuration the Flink runtime configuration supplied during operator initialization + * @throws Exception if the parent initialization or client creation fails + */ @Override public void open(Configuration configuration) throws Exception { super.open(configuration); @@ -184,6 +226,17 @@ public void open(Configuration configuration) throws Exception { */ protected abstract void process(Row input, ResultFuture resultFuture) throws Exception; + /** + * {@inheritDoc} + * + *
<p>
Delegates to {@link #process(Row, ResultFuture)} and records a successful external call. Pattern or + * variable configuration problems are translated into an {@code InvalidConfigurationException} that is + * reported and propagated through the result future. + * + * @param input the incoming row to be enriched by the external lookup + * @param resultFuture the future used to emit the enriched row or an error + * @throws Exception if processing fails in an unrecoverable way + */ @Override public void asyncInvoke(Row input, ResultFuture resultFuture) throws Exception { @@ -215,6 +268,15 @@ protected void reportAndThrowError(ResultFuture resultFuture, Exception exc resultFuture.completeExceptionally(exception); } + /** + * {@inheritDoc} + * + *
<p>
Marks a timeout metric and either reports and throws the error (when the source is configured to fail + * on errors) or reports it as non-fatal, finally completing the future with the original unmodified input row. + * + * @param input the row whose external call timed out + * @param resultFuture the future used to emit the fallback row or an error + */ @Override public void timeout(Row input, ResultFuture resultFuture) { meterStatsManager.markEvent(ExternalSourceAspects.TIMEOUTS); @@ -227,22 +289,46 @@ public void timeout(Row input, ResultFuture resultFuture) { resultFuture.complete(singleton(input)); } + /** + * {@inheritDoc} + * + *
<p>
Delegates to the parent implementation to release any resources held by the rich async function. + * + * @throws Exception if the parent cleanup fails + */ @Override public void close() throws Exception { super.close(); } + /** + * {@inheritDoc} + * + * @return the telemetry collected for this connector, keyed by telemetry type + */ @Override public Map> getTelemetry() { return metrics; } + /** + * {@inheritDoc} + * + *
<p>
Records the post-processor type telemetry entry for this connector's source type before telemetry + * subscribers are notified. + */ @Override public void preProcessBeforeNotifyingSubscriber() { addMetric(TelemetryTypes.POST_PROCESSOR_TYPE.getValue(), sourceType); } + /** + * Appends a telemetry value under the given key, creating the backing list on first use. + * + * @param key the telemetry key to record the value under + * @param value the telemetry value to add + */ private void addMetric(String key, String value) { metrics.computeIfAbsent(key, k -> new ArrayList<>()).add(value); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalMetricConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalMetricConfig.java index 99183e998..5bd41e415 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalMetricConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalMetricConfig.java @@ -10,9 +10,21 @@ * The External metric config. */ public class ExternalMetricConfig implements Serializable { + /** + * The grace period in milliseconds to wait for telemetry reporters to flush before shutting down. + */ private final long shutDownPeriod; + /** + * Whether telemetry/metric reporting is enabled for the external source. + */ private final boolean telemetryEnabled; + /** + * The subscriber notified of telemetry produced by the external source. + */ private TelemetrySubscriber telemetrySubscriber; + /** + * The identifier used to distinguish metrics emitted for a specific external source instance. 
+ */ private String metricId; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalPostProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalPostProcessor.java index fd91e74f1..669095bcd 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalPostProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalPostProcessor.java @@ -26,8 +26,17 @@ */ public class ExternalPostProcessor implements PostProcessor { + /** + * The schema configuration shared with the stream decorators and async connectors created here. + */ private final SchemaConfig schemaConfig; + /** + * The configuration holding all external lookup sources (HTTP, Elasticsearch, Postgres, and gRPC). + */ private final ExternalSourceConfig externalSourceConfig; + /** + * The metric configuration applied to each external source, including the per-source metric id. + */ private final ExternalMetricConfig externalMetricConfig; /** @@ -43,12 +52,31 @@ public ExternalPostProcessor(SchemaConfig schemaConfig, ExternalSourceConfig ext this.externalMetricConfig = externalMetricConfig; } + /** + * {@inheritDoc} + * + *
<p>
Indicates that this post-processor runs only when the post-processor configuration declares an + * external source. + * + * @param postProcessorConfig the post-processor configuration to inspect + * @return {@code true} if an external source is configured, otherwise {@code false} + */ @Override public boolean canProcess(PostProcessorConfig postProcessorConfig) { return postProcessorConfig.hasExternalSource(); } + /** + * {@inheritDoc} + * + *
<p>
Enriches the incoming stream by chaining an async lookup for every configured HTTP, Elasticsearch, + * Postgres, and gRPC source, assigning each source a metric id, and returns a new {@link StreamInfo} + * wrapping the enriched stream with the original column names. + * + * @param streamInfo the stream and column metadata to enrich + * @return a new {@link StreamInfo} carrying the enriched data stream + */ @Override public StreamInfo process(StreamInfo streamInfo) { DataStream resultStream = streamInfo.getDataStream(); @@ -84,11 +112,26 @@ public StreamInfo process(StreamInfo streamInfo) { return new StreamInfo(resultStream, streamInfo.getColumnNames()); } + /** + * Resolves the metric id for a source, falling back to its positional index when none is configured. + * + * @param index the positional index of the source within its configured list + * @param sourceConfig the source configuration to read the metric id from + * @return the configured metric id, or the index as a string when it is empty + */ private String getMetricId(int index, SourceConfig sourceConfig) { String metricId = sourceConfig.getMetricId(); return (StringUtils.isEmpty(metricId)) ? String.valueOf(index) : metricId; } + /** + * Validates the given source configuration and applies the decorator that attaches the async lookup. 
+ * + * @param resultStream the stream to be enriched + * @param configs the source configuration whose fields are validated before decoration + * @param decorator the decorator that wires the async connector onto the stream + * @return the decorated stream + */ private DataStream enrichStream(DataStream resultStream, Validator configs, StreamDecorator decorator) { configs.validateFields(); return decorator.decorate(resultStream); @@ -114,10 +157,22 @@ protected EsStreamDecorator getEsDecorator(EsSourceConfig esSourceConfig) { return new EsStreamDecorator(esSourceConfig, externalMetricConfig, schemaConfig); } + /** + * Builds the Postgres stream decorator for the given source configuration. + * + * @param pgSourceConfig the Postgres source configuration + * @return a new {@link PgStreamDecorator} bound to the shared metric and schema configuration + */ private PgStreamDecorator getPgDecorator(PgSourceConfig pgSourceConfig) { return new PgStreamDecorator(pgSourceConfig, externalMetricConfig, schemaConfig); } + /** + * Builds the gRPC stream decorator for the given source configuration. + * + * @param grpcSourceConfig the gRPC source configuration + * @return a new {@link GrpcStreamDecorator} bound to the shared metric and schema configuration + */ private GrpcStreamDecorator getGrpcDecorator(GrpcSourceConfig grpcSourceConfig) { return new GrpcStreamDecorator(grpcSourceConfig, externalMetricConfig, schemaConfig); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalSourceConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalSourceConfig.java index 0c6e372b7..781c591b0 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalSourceConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/ExternalSourceConfig.java @@ -13,9 +13,21 @@ * A class that holds external post processor configuration. 
*/ public class ExternalSourceConfig { + /** + * The list of HTTP external lookup configurations, or {@code null} when none are configured. + */ private List http; + /** + * The list of Elasticsearch external lookup configurations, or {@code null} when none are configured. + */ private List es; + /** + * The list of Postgres external lookup configurations, or {@code null} when none are configured. + */ private List pg; + /** + * The list of gRPC external lookup configurations, or {@code null} when none are configured. + */ private List grpc; /** @@ -93,6 +105,13 @@ public List getOutputColumnNames() { return columnNames; } + /** + * Collects the output column names contributed by each source in the given list. + * + * @param <T> the concrete {@link SourceConfig} type held in the list + * @param configs the source configurations to read output columns from; may be {@code null} + * @return the aggregated output column names, or an empty list when {@code configs} is {@code null} + */ private ArrayList getOutputColumnNames(List configs) { ArrayList columnNames = new ArrayList<>(); if (configs == null) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsAsyncConnector.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsAsyncConnector.java index fe65e0eed..e9279aafc 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsAsyncConnector.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsAsyncConnector.java @@ -29,7 +29,13 @@ */ public class EsAsyncConnector extends AsyncConnector { + /** + * The Elasticsearch source configuration backing this connector's lookups. + */ private final EsSourceConfig esSourceConfig; + /** + * The low-level Elasticsearch REST client used to issue asynchronous search requests. 
+ */ private RestClient esClient; /** @@ -62,6 +68,12 @@ public EsAsyncConnector(EsSourceConfig esSourceConfig, ExternalMetricConfig exte this.esSourceConfig = esSourceConfig; } + /** + * {@inheritDoc} + * + *
<p>
Lazily builds the Elasticsearch {@link RestClient} from the configured hosts, attaching the + * credentials provider, request timeouts, and maximum retry timeout, unless a client was already injected. + */ @Override protected void createClient() { if (esClient == null) { @@ -74,6 +86,16 @@ protected void createClient() { } } + /** + * {@inheritDoc} + * + *
<p>
Resolves the endpoint variables from the input row, validates the query, formats the configured + * endpoint pattern, and issues an asynchronous {@code GET} request whose response is handled by an + * {@link EsResponseHandler}. + * + * @param input the incoming row supplying the endpoint variable values + * @param resultFuture the future completed with the enriched row or an error + */ @Override protected void process(Row input, ResultFuture resultFuture) { RowManager rowManager = new RowManager(input); @@ -90,6 +112,11 @@ protected void process(Row input, ResultFuture resultFuture) { esClient.performRequestAsync(esRequest, esResponseHandler); } + /** + * Parses the comma-separated host configuration into an array of {@link HttpHost} entries. + * + * @return the Elasticsearch hosts to connect to, each bound to the configured port + */ private HttpHost[] getHttpHosts() { List hosts = Arrays.asList(esSourceConfig.getHost().split(",")); ArrayList httpHosts = new ArrayList<>(); @@ -97,12 +124,22 @@ private HttpHost[] getHttpHosts() { return httpHosts.toArray(new HttpHost[0]); } + /** + * Builds the request configuration carrying the configured connect and socket timeouts. + * + * @return the {@link RequestConfig} applied to each Elasticsearch request + */ private RequestConfig getRequestConfig() { return RequestConfig.custom() .setConnectTimeout(esSourceConfig.getConnectTimeout()) .setSocketTimeout(esSourceConfig.getSocketTimeout()).build(); } + /** + * Builds a credentials provider seeded with the configured username and password for basic auth. 
+ * + * @return the {@link CredentialsProvider} used to authenticate Elasticsearch requests + */ private CredentialsProvider getCredentialsProvider() { final CredentialsProvider credentialsProvider = new BasicCredentialsProvider(); credentialsProvider.setCredentials(AuthScope.ANY, diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsResponseHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsResponseHandler.java index 8b87d66fe..41a1cdf7e 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsResponseHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsResponseHandler.java @@ -37,15 +37,45 @@ * The ElasticSearch response handler. */ public class EsResponseHandler implements ResponseListener { + /** + * Logger used to record Elasticsearch response parsing and processing errors. + */ private static final Logger LOGGER = LoggerFactory.getLogger(EsResponseHandler.class.getName()); + /** + * The Elasticsearch source configuration describing output mappings and error behaviour. + */ private EsSourceConfig esSourceConfig; + /** + * Manager wrapping the input and output rows that response values are written into. + */ private RowManager rowManager; + /** + * The protobuf descriptor of the output message used to type-cast response values. + */ private Descriptor outputDescriptor; + /** + * The future completed with the enriched row once the response has been handled. + */ private ResultFuture resultFuture; + /** + * The time at which the request was issued, used to compute response latency telemetry. + */ private Instant startTime; + /** + * Manager used to emit meter-style metrics for response outcomes. + */ private MeterStatsManager meterStatsManager; + /** + * Resolver mapping output column names to their positions in the output row. 
+ */ private ColumnNameManager columnNameManager; + /** + * Reporter used to surface fatal and non-fatal errors raised while handling responses. + */ private ErrorReporter errorReporter; + /** + * Helper that emits success and failure telemetry for the external call. + */ private PostResponseTelemetry postResponseTelemetry; /** @@ -78,6 +108,15 @@ public void startTimer() { startTime = Instant.now(); } + /** + * {@inheritDoc} + * + *

<p>For a successful ({@code 200 OK}) response, reads each configured output column from the JSON body + * using its JSON path and writes the value into the output row. Path, parse, read, and other errors are + * recorded as telemetry and reported. The result future is always completed with the current row. + * + * @param response the Elasticsearch response to read enrichment values from + */ @Override public void onSuccess(Response response) { try { @@ -121,6 +160,15 @@ public void onSuccess(Response response) { } } + /** + * {@inheritDoc} + * + *

<p>Records failure telemetry and either reports and throws an {@code HttpFailureException} (when the + * source is configured to fail on errors) or reports it as non-fatal, validating the HTTP status code when + * the cause is a {@code ResponseException}. The result future is always completed with the current row. + * + * @param e the exception describing the Elasticsearch failure + */ @Override public void onFailure(Exception e) { postResponseTelemetry.sendFailureTelemetry(meterStatsManager, startTime); @@ -140,6 +188,18 @@ public void onFailure(Exception e) { } + /** + * Writes a single response value into the output row at the given index. + * + *

When the response type is not retained (or an explicit type is configured), the value is converted + * using the field's {@link TypeHandler}, and map values are built into a nested {@link Row}. A missing + * field descriptor is reported as an error. + * + * @param esConfig the Elasticsearch source configuration controlling type handling + * @param index the output row index to write the value into + * @param value the raw value read from the response + * @param name the output column (field) name being populated + */ private void setField(EsSourceConfig esConfig, int index, Object value, String name) { if (!esConfig.isRetainResponseType() || esConfig.hasType()) { Descriptors.FieldDescriptor fieldDescriptor = outputDescriptor.findFieldByName(name); @@ -160,6 +220,11 @@ private void setField(EsSourceConfig esConfig, int index, Object value, String n } } + /** + * Reports the given exception as fatal and completes the result future exceptionally. + * + * @param exception the exception to report and propagate + */ private void reportAndThrowError(Exception exception) { errorReporter.reportFatalException(exception); resultFuture.completeExceptionally(exception); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsSourceConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsSourceConfig.java index 13d2fa2cf..8e98089bd 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsSourceConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsSourceConfig.java @@ -16,23 +16,73 @@ * A class that holds ElasticSearch configuration. */ public class EsSourceConfig implements Serializable, SourceConfig { + /** + * The comma-separated Elasticsearch host(s) to connect to. + */ private final String host; + /** + * The Elasticsearch port to connect to. 
+ */ private final String port; + /** + * The username used for basic authentication, or empty when unauthenticated. + */ private final String user; + /** + * The password used for basic authentication, or empty when unauthenticated. + */ private final String password; + /** + * The format pattern used to build the Elasticsearch endpoint from the resolved variables. + */ private final String endpointPattern; + /** + * The comma-separated input columns whose values are substituted into the endpoint pattern. + */ private final String endpointVariables; + /** + * The fully-qualified protobuf class name used to type-cast the response, recognised under the JSON keys + * {@code type}, {@code Type}, or {@code TYPE}. + */ @SerializedName(value = "type", alternate = {"Type", "TYPE"}) private final String type; + /** + * The maximum number of concurrent asynchronous requests buffered by the async operator. + */ private final String capacity; + /** + * The maximum retry timeout in milliseconds applied by the Elasticsearch client. + */ private final String retryTimeout; + /** + * The socket timeout in milliseconds for an Elasticsearch request. + */ private final String socketTimeout; + /** + * The overall stream/async timeout in milliseconds for the lookup. + */ private final String streamTimeout; + /** + * The connection timeout in milliseconds for establishing an Elasticsearch connection. + */ private final String connectTimeout; + /** + * Whether a lookup failure should fail the job ({@code true}) or be tolerated as non-fatal ({@code false}). + */ private final boolean failOnErrors; + /** + * The mapping of output column name to the configuration describing how to extract its value. + */ private final Map outputMapping; + /** + * Optional identifier used to disambiguate metrics emitted for this source, recognised under the JSON keys + * {@code metricId}, {@code MetricId}, or {@code METRICID}. 
+ */ @SerializedName(value = "metricId", alternate = {"MetricId", "METRICID"}) private final String metricId; + /** + * Whether the raw response value type is retained as-is instead of being cast to the configured proto type. + */ private final boolean retainResponseType; @@ -114,26 +164,51 @@ public String getPassword() { return password == null ? "" : password; } + /** + * {@inheritDoc} + * + * @return the endpoint pattern used to build the Elasticsearch request + */ @Override public String getPattern() { return endpointPattern; } + /** + * {@inheritDoc} + * + * @return the comma-separated endpoint variable column names + */ @Override public String getVariables() { return endpointVariables; } + /** + * {@inheritDoc} + * + * @return {@code true} if lookup failures should fail the job, otherwise {@code false} + */ @Override public boolean isFailOnErrors() { return failOnErrors; } + /** + * {@inheritDoc} + * + * @return the configured metric id for this source, or {@code null} when unset + */ @Override public String getMetricId() { return metricId; } + /** + * {@inheritDoc} + * + * @return the configured protobuf type name used to cast the response, or {@code null} when unset + */ @Override public String getType() { return type; @@ -203,6 +278,14 @@ public String getPath(String outputColumn) { return outputMapping.get(outputColumn).getPath(); } + /** + * {@inheritDoc} + * + *

Exposes the host, port, endpoint pattern, capacity, timeouts, fail-on-errors flag, and output mapping + * as the fields that must be present for this configuration to be valid. + * + * @return a map of mandatory field names to their configured values + */ @Override public HashMap getMandatoryFields() { HashMap mandatoryFields = new HashMap<>(); @@ -220,6 +303,11 @@ public HashMap getMandatoryFields() { return mandatoryFields; } + /** + * {@inheritDoc} + * + * @return the output column names derived from the configured output mapping keys + */ @Override public List getOutputColumns() { return new ArrayList<>(outputMapping.keySet()); @@ -234,6 +322,12 @@ public boolean isRetainResponseType() { return retainResponseType; } + /** + * {@inheritDoc} + * + * @param o the object to compare with this configuration + * @return {@code true} if the other object is an {@code EsSourceConfig} with equal fields + */ @Override public boolean equals(Object o) { if (this == o) { @@ -246,6 +340,11 @@ public boolean equals(Object o) { return failOnErrors == that.failOnErrors && retainResponseType == that.retainResponseType && Objects.equals(host, that.host) && Objects.equals(port, that.port) && Objects.equals(user, that.user) && Objects.equals(password, that.password) && Objects.equals(endpointPattern, that.endpointPattern) && Objects.equals(endpointVariables, that.endpointVariables) && Objects.equals(type, that.type) && Objects.equals(capacity, that.capacity) && Objects.equals(retryTimeout, that.retryTimeout) && Objects.equals(socketTimeout, that.socketTimeout) && Objects.equals(streamTimeout, that.streamTimeout) && Objects.equals(connectTimeout, that.connectTimeout) && Objects.equals(outputMapping, that.outputMapping) && Objects.equals(metricId, that.metricId); } + /** + * {@inheritDoc} + * + * @return a hash code derived from all configuration fields + */ @Override public int hashCode() { return Objects.hash(host, port, user, password, endpointPattern, endpointVariables, type, 
capacity, retryTimeout, socketTimeout, streamTimeout, connectTimeout, failOnErrors, outputMapping, metricId, retainResponseType); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsSourceConfigBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsSourceConfigBuilder.java index 6f6f59dfd..32ad1f603 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsSourceConfigBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsSourceConfigBuilder.java @@ -4,104 +4,229 @@ import java.util.Map; +/** + * Fluent builder for {@link EsSourceConfig}. + * + *

Each setter assigns one configuration property and returns {@code this} so calls can be chained; + * {@link #createEsSourceConfig()} then produces the immutable config. It is a convenient way to + * assemble an Elasticsearch source config in code (for example in tests) without the long positional + * constructor. + */ public class EsSourceConfigBuilder { + /** Comma-separated Elasticsearch hosts. */ private String host; + /** Port shared by the configured hosts. */ private String port; + /** Basic-auth username, or {@code null} when unused. */ private String user; + /** Basic-auth password, or {@code null} when unused. */ private String password; + /** Endpoint pattern populated with the endpoint variables. */ private String endpointPattern; + /** Comma-separated input columns filling the endpoint pattern. */ private String endpointVariables; + /** Optional output protobuf class name. */ private String type; + /** Maximum number of concurrent async requests. */ private String capacity; + /** Connection timeout in milliseconds. */ private String connectTimeout; + /** Maximum retry timeout in milliseconds. */ private String retryTimeout; + /** Socket timeout in milliseconds. */ private String socketTimeout; + /** Async-IO stream timeout in milliseconds. */ private String streamTimeout; + /** Whether a failed lookup should fail the job. */ private boolean failOnErrors; + /** Output column to JSON-path mapping. */ private Map outputMapping; + /** Optional metric id grouping this source's metrics. */ private String metricId; + /** Whether to keep raw response values without type coercion. */ private boolean retainResponseType; + /** + * Sets the comma-separated Elasticsearch hosts. + * + * @param host the hosts to connect to + * @return this builder for chaining + */ public EsSourceConfigBuilder setHost(String host) { this.host = host; return this; } + /** + * Sets the port shared by the hosts. 
+ * + * @param port the Elasticsearch port + * @return this builder for chaining + */ public EsSourceConfigBuilder setPort(String port) { this.port = port; return this; } + /** + * Sets the basic-auth username. + * + * @param user the username, or {@code null} when auth is not used + * @return this builder for chaining + */ public EsSourceConfigBuilder setUser(String user) { this.user = user; return this; } + /** + * Sets the basic-auth password. + * + * @param password the password, or {@code null} when auth is not used + * @return this builder for chaining + */ public EsSourceConfigBuilder setPassword(String password) { this.password = password; return this; } + /** + * Sets the endpoint pattern used to build the request URI. + * + * @param endpointPattern the pattern populated with the endpoint variables + * @return this builder for chaining + */ public EsSourceConfigBuilder setEndpointPattern(String endpointPattern) { this.endpointPattern = endpointPattern; return this; } + /** + * Sets the comma-separated input columns substituted into the endpoint pattern. + * + * @param endpointVariables the input column names + * @return this builder for chaining + */ public EsSourceConfigBuilder setEndpointVariables(String endpointVariables) { this.endpointVariables = endpointVariables; return this; } + /** + * Sets the optional output protobuf class name. + * + * @param type the protobuf class name used to type the response + * @return this builder for chaining + */ public EsSourceConfigBuilder setType(String type) { this.type = type; return this; } + /** + * Sets the maximum number of concurrent async requests. + * + * @param capacity the async capacity + * @return this builder for chaining + */ public EsSourceConfigBuilder setCapacity(String capacity) { this.capacity = capacity; return this; } + /** + * Sets the connection timeout. 
+ * + * @param connectTimeout the connect timeout in milliseconds + * @return this builder for chaining + */ public EsSourceConfigBuilder setConnectTimeout(String connectTimeout) { this.connectTimeout = connectTimeout; return this; } + /** + * Sets the maximum retry timeout. + * + * @param retryTimeout the retry timeout in milliseconds + * @return this builder for chaining + */ public EsSourceConfigBuilder setRetryTimeout(String retryTimeout) { this.retryTimeout = retryTimeout; return this; } + /** + * Sets the socket timeout. + * + * @param socketTimeout the socket timeout in milliseconds + * @return this builder for chaining + */ public EsSourceConfigBuilder setSocketTimeout(String socketTimeout) { this.socketTimeout = socketTimeout; return this; } + /** + * Sets the async-IO stream timeout. + * + * @param streamTimeout the stream timeout in milliseconds + * @return this builder for chaining + */ public EsSourceConfigBuilder setStreamTimeout(String streamTimeout) { this.streamTimeout = streamTimeout; return this; } + /** + * Sets whether a failed lookup should fail the job. + * + * @param failOnErrors {@code true} to fail the job on lookup errors + * @return this builder for chaining + */ public EsSourceConfigBuilder setFailOnErrors(boolean failOnErrors) { this.failOnErrors = failOnErrors; return this; } + /** + * Sets the output column to JSON-path mapping. + * + * @param outputMapping the mapping from output column name to its response path + * @return this builder for chaining + */ public EsSourceConfigBuilder setOutputMapping(Map outputMapping) { this.outputMapping = outputMapping; return this; } + /** + * Sets the optional metric id grouping this source's metrics. + * + * @param metricId the metric id + * @return this builder for chaining + */ public EsSourceConfigBuilder setMetricId(String metricId) { this.metricId = metricId; return this; } + /** + * Sets whether raw response values are kept without type coercion. 
+ * + * @param retainResponseType {@code true} to retain the raw response type + * @return this builder for chaining + */ public EsSourceConfigBuilder setRetainResponseType(boolean retainResponseType) { this.retainResponseType = retainResponseType; return this; } + /** + * Builds an immutable {@link EsSourceConfig} from the values accumulated in this builder. + * + * @return the assembled Elasticsearch source config + */ public EsSourceConfig createEsSourceConfig() { return new EsSourceConfig(host, port, user, password, endpointPattern, endpointVariables, type, capacity, connectTimeout, retryTimeout, socketTimeout, streamTimeout, failOnErrors, outputMapping, metricId, retainResponseType); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsStreamDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsStreamDecorator.java index 179f1480f..c562fa4b9 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsStreamDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/es/EsStreamDecorator.java @@ -15,8 +15,17 @@ */ public class EsStreamDecorator implements StreamDecorator { + /** + * The Elasticsearch source configuration passed to the connector created by this decorator. + */ private final EsSourceConfig esSourceConfig; + /** + * The metric configuration applied to the connector, including telemetry and the metric id. + */ private final ExternalMetricConfig externalMetricConfig; + /** + * The schema configuration providing column and proto metadata to the connector. 
+ */ private final SchemaConfig schemaConfig; /** @@ -32,11 +41,25 @@ public EsStreamDecorator(EsSourceConfig esSourceConfig, ExternalMetricConfig ext this.schemaConfig = schemaConfig; } + /** + * {@inheritDoc} + * + * @return {@code true} when an Elasticsearch source configuration is present + */ @Override public Boolean canDecorate() { return esSourceConfig != null; } + /** + * {@inheritDoc} + * + *

Wraps the input stream in an ordered asynchronous Elasticsearch lookup, registering the telemetry + * subscriber and applying the configured stream timeout and capacity. + * + * @param inputStream the stream to enrich with Elasticsearch lookups + * @return the asynchronously enriched stream + */ @Override public DataStream decorate(DataStream inputStream) { EsAsyncConnector esAsyncConnector = new EsAsyncConnector(esSourceConfig, externalMetricConfig, schemaConfig); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcAsyncConnector.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcAsyncConnector.java index b200c30f6..523dc15e5 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcAsyncConnector.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcAsyncConnector.java @@ -29,10 +29,19 @@ */ public class GrpcAsyncConnector extends AsyncConnector { + /** + * Logger used to record gRPC connection lifecycle events. + */ private static final Logger LOGGER = LoggerFactory.getLogger(GrpcAsyncConnector.class.getName()); + /** + * The gRPC source configuration backing this connector's lookups. + */ private GrpcSourceConfig grpcSourceConfig; + /** + * The gRPC client used to issue asynchronous unary calls to the configured service. + */ private GrpcClient grpcClient; /** @@ -68,11 +77,26 @@ public GrpcAsyncConnector(GrpcSourceConfig grpcSourceConfig, ExternalMetricConfi } + /** + * {@inheritDoc} + * + *

<p>Builds a {@link DescriptorManager} that resolves protobuf descriptors from the configured gRPC + * stencil URLs in addition to the stencil client orchestrator. + * + * @param schemaConfig the schema configuration providing the stencil client orchestrator + * @return a descriptor manager backed by the configured gRPC stencil URLs + */ @Override public DescriptorManager initDescriptorManager(SchemaConfig schemaConfig) { return new DescriptorManager(schemaConfig.getStencilClientOrchestrator(), grpcSourceConfig.getGrpcStencilUrl()); } + /** + * {@inheritDoc} + * + *

<p>Lazily creates the {@link GrpcClient} from the source configuration and opens its managed channel, + * unless a client was already injected. + */ @Override protected void createClient() { @@ -82,6 +106,18 @@ protected void createClient() { } } + /** + * {@inheritDoc} + * + *

Resolves the request variables from the input row, validates the query, builds a request + * {@link DynamicMessage} via a {@link GrpcRequestHandler}, and issues an asynchronous unary call whose + * response is handled by a {@link GrpcResponseHandler}. Invalid bodies and unavailable channels are + * recorded as telemetry and propagated through the result future. + * + * @param input the incoming row supplying the request variable values + * @param resultFuture the future completed with the enriched row or an error + * @throws Exception if the call cannot be initiated + */ @Override protected void process(Row input, ResultFuture resultFuture) throws Exception { @@ -114,6 +150,12 @@ protected void process(Row input, ResultFuture resultFuture) throws Excepti } + /** + * Resolves the protobuf descriptor for the configured gRPC response schema. + * + * @param resultFuture the future used to report a missing descriptor as an error + * @return the response message descriptor, or {@code null} when no schema is configured + */ private Descriptors.Descriptor getOutputDescriptorForGrpcResponse(ResultFuture resultFuture) { String descriptorClassName = grpcSourceConfig.getGrpcResponseProtoSchema(); Descriptors.Descriptor grpcProtoDescriptor = null; @@ -127,6 +169,12 @@ private Descriptors.Descriptor getOutputDescriptorForGrpcResponse(ResultFuture resultFuture) { String descriptorClassName = grpcSourceConfig.getGrpcRequestProtoSchema(); Descriptors.Descriptor grpcProtoDescriptor = null; @@ -140,6 +188,11 @@ private Descriptors.Descriptor getInputDescriptorForGrpcRequest(ResultFutureCloses the gRPC client, releases the channel, records a connection-close metric, and logs the closure. 
+ */ @Override public void close() { grpcClient.close(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcResponseHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcResponseHandler.java index d88b39ca1..b453ad3f8 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcResponseHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcResponseHandler.java @@ -34,15 +34,45 @@ */ public class GrpcResponseHandler implements StreamObserver { + /** + * Logger used to record gRPC response parsing and processing errors. + */ private static final Logger LOGGER = LoggerFactory.getLogger(GrpcResponseHandler.class.getName()); + /** + * Manager wrapping the input and output rows that response values are written into. + */ private final RowManager rowManager; + /** + * Resolver mapping output column names to their positions in the output row. + */ private ColumnNameManager columnNameManager; + /** + * The protobuf descriptor of the output message used to type-cast response values. + */ private Descriptors.Descriptor descriptor; + /** + * The future completed with the enriched row once the response has been handled. + */ private ResultFuture resultFuture; + /** + * The gRPC source configuration describing output mappings and error behaviour. + */ private GrpcSourceConfig grpcSourceConfig; + /** + * Manager used to emit meter-style metrics for response outcomes. + */ private MeterStatsManager meterStatsManager; + /** + * The time at which the request was issued, used to compute response latency telemetry. + */ private Instant startTime; + /** + * Reporter used to surface fatal and non-fatal errors raised while handling responses. + */ private ErrorReporter errorReporter; + /** + * Helper that emits success and failure telemetry for the external call. 
+ */ private PostResponseTelemetry postResponseTelemetry; /** @@ -69,6 +99,15 @@ public GrpcResponseHandler(GrpcSourceConfig grpcSourceConfig, MeterStatsManager this.postResponseTelemetry = postResponseTelemetry; } + /** + * Handles a successful gRPC response by extracting each configured output value and populating the row. + * + *

<p>The response message is rendered to JSON and each configured output mapping is read via its JSON + * path and written into the output row; path or protobuf parsing failures are reported as errors. On + * success the result future is completed with the enriched row. + * + * @param message the response message returned by the gRPC service + */ private void successHandler(DynamicMessage message) { Map outputMappings = grpcSourceConfig.getOutputMapping(); ArrayList outputMappingKeys = new ArrayList<>(outputMappings.keySet()); @@ -103,6 +142,16 @@ private void successHandler(DynamicMessage message) { } + /** + * Writes a single response value into the output row at the given index. + * + *

When the response type is not retained (or an explicit type is configured), the value is converted + * using the field's type handler; otherwise the raw value is stored directly. + * + * @param key the output column (field) name being populated + * @param value the raw value read from the response + * @param fieldIndex the output row index to write the value into + */ private void setField(String key, Object value, int fieldIndex) { if (!grpcSourceConfig.isRetainResponseType() || grpcSourceConfig.hasType()) { setFieldUsingType(key, value, fieldIndex); @@ -111,6 +160,13 @@ private void setField(String key, Object value, int fieldIndex) { } } + /** + * Converts and writes a response value using the type handler resolved from the output descriptor. + * + * @param key the output column (field) name being populated + * @param value the raw value read from the response + * @param fieldIndex the output row index to write the value into + */ private void setFieldUsingType(String key, Object value, int fieldIndex) { Descriptors.FieldDescriptor fieldDescriptor = null; try { @@ -126,6 +182,11 @@ private void setFieldUsingType(String key, Object value, int fieldIndex) { } + /** + * Reports the given exception as fatal and completes the result future exceptionally. + * + * @param e the exception to report and propagate + */ private void reportAndThrowError(Exception e) { errorReporter.reportFatalException(e); resultFuture.completeExceptionally(e); @@ -156,11 +217,25 @@ public void startTimer() { startTime = Instant.now(); } + /** + * {@inheritDoc} + * + *

<p>Delegates the received response message to the success handler. + * + * @param message the response message emitted by the gRPC stream + */ @Override public void onNext(DynamicMessage message) { successHandler(message); } + /** + * {@inheritDoc} + * + *

<p>Prints the error's stack trace, records an error metric, and routes the failure through the failure handler. + * + * @param t the error raised by the gRPC stream + */ @Override public void onError(Throwable t) { t.printStackTrace(); @@ -169,6 +244,12 @@ public void onError(Throwable t) { } + /** + * {@inheritDoc} + * + *

No action is required when the gRPC stream completes, as results are emitted from + * {@link #onNext(DynamicMessage)}. + */ @Override public void onCompleted() { } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcSourceConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcSourceConfig.java index 2a88745ae..3d7da5fd1 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcSourceConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcSourceConfig.java @@ -17,26 +17,85 @@ * A class that holds Grpc configuration. */ public class GrpcSourceConfig implements Serializable, SourceConfig { + /** + * The hostname/address of the gRPC service to call. + */ private String endpoint; + /** + * The port of the gRPC service to call. + */ private int servicePort; + /** + * The fully-qualified protobuf class name describing the gRPC request message. + */ private String grpcRequestProtoSchema; + /** + * The fully-qualified protobuf class name describing the gRPC response message. + */ private String grpcResponseProtoSchema; + /** + * The fully-qualified gRPC method URL ({@code package.Service/Method}) to invoke. + */ private String grpcMethodUrl; + /** + * The format pattern used to build the JSON request body from the resolved variables. + */ private String requestPattern; + /** + * The comma-separated input columns whose values are substituted into the request pattern. + */ private String requestVariables; + /** + * The gRPC keepalive ping interval in milliseconds, or {@code null} to use the default. + */ private String grpcArgKeepaliveTimeMs; + /** + * The gRPC keepalive ping timeout in milliseconds, or {@code null} to use the default. + */ private String grpcArgKeepaliveTimeoutMs; + /** + * The overall stream/async timeout in milliseconds for the lookup. 
+ */ private String streamTimeout; + /** + * The connection timeout in milliseconds for establishing the gRPC channel. + */ private String connectTimeout; + /** + * Whether a lookup failure should fail the job ({@code true}) or be tolerated as non-fatal ({@code false}). + */ private boolean failOnErrors; + /** + * The fully-qualified protobuf class name used to type-cast the response, when configured. + */ private String type; + /** + * Whether the raw response value type is retained as-is instead of being cast to the configured proto type. + */ private boolean retainResponseType; + /** + * The comma-separated stencil URLs used to resolve gRPC request/response descriptors. + */ private String grpcStencilUrl; + /** + * Optional gRPC metadata headers sent with each call, recognised under the JSON keys {@code headers}, + * {@code Headers}, or {@code HEADERS}. + */ @SerializedName(value = "headers", alternate = {"Headers", "HEADERS"}) private Map headers; + /** + * The mapping of output column name to the configuration describing how to extract its value. + */ private Map outputMapping; + /** + * Optional identifier used to disambiguate metrics emitted for this source, recognised under the JSON keys + * {@code metricId}, {@code MetricId}, or {@code METRICID}. + */ @SerializedName(value = "metricId", alternate = {"MetricId", "METRICID"}) private String metricId; + /** + * The maximum number of concurrent asynchronous requests buffered by the async operator. 
+ */ private int capacity; /** @@ -125,11 +184,21 @@ public String getEndpoint() { return endpoint; } + /** + * {@inheritDoc} + * + * @return the request pattern used to build the gRPC request body + */ @Override public String getPattern() { return requestPattern; } + /** + * {@inheritDoc} + * + * @return the comma-separated request variable column names + */ @Override public String getVariables() { return requestVariables; @@ -144,15 +213,30 @@ public Integer getStreamTimeout() { return Integer.valueOf(streamTimeout); } + /** + * Returns whether a lookup failure should fail the job. + * + * @return {@code true} if lookup failures should fail the job, otherwise {@code false} + */ public boolean isFailOnErrors() { return failOnErrors; } + /** + * {@inheritDoc} + * + * @return the configured metric id for this source, or {@code null} when unset + */ @Override public String getMetricId() { return metricId; } + /** + * Gets the configured protobuf type name used to cast the response. + * + * @return the configured type name, or {@code null} when unset + */ public String getType() { return type; } @@ -175,11 +259,24 @@ public Map getOutputMapping() { return outputMapping; } + /** + * {@inheritDoc} + * + * @return the output column names derived from the configured output mapping keys + */ @Override public List getOutputColumns() { return new ArrayList<>(outputMapping.keySet()); } + /** + * Gets the fields that must be present for this configuration to be valid. + * + *

Exposes the endpoint, service port, request/response proto schemas, method url, fail-on-errors flag, + * request pattern, request variables, timeouts, and output mapping. + * + * @return a map of mandatory field names to their configured values + */ public HashMap getMandatoryFields() { HashMap mandatoryFields = new HashMap<>(); mandatoryFields.put("endpoint", endpoint); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcSourceConfigBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcSourceConfigBuilder.java index da9393926..97b102279 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcSourceConfigBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcSourceConfigBuilder.java @@ -4,122 +4,267 @@ import java.util.Map; +/** + * Fluent builder for {@link GrpcSourceConfig}. + * + *

Each setter assigns one configuration property and returns {@code this} so calls can be chained; + * {@link #createGrpcSourceConfig()} then produces the config. It is a convenient way to assemble a + * gRPC source config in code (for example in tests) without the long positional constructor. + */ public class GrpcSourceConfigBuilder { + /** Hostname or address of the gRPC service. */ private String endpoint; + /** Port of the gRPC service. */ private int servicePort; + /** Request message protobuf class name. */ private String grpcRequestProtoSchema; + /** Response message protobuf class name. */ private String grpcResponseProtoSchema; + /** Fully-qualified gRPC method name to invoke. */ private String grpcMethodUrl; + /** Channel keepalive interval in milliseconds. */ private String grpcArgKeepaliveTimeMs; + /** Channel keepalive timeout in milliseconds. */ private String grpcArgKeepaliveTimeoutMs; + /** Request body pattern populated with the request variables. */ private String requestPattern; + /** Comma-separated input columns filling the request pattern. */ private String requestVariables; + /** Output column to JSON-path mapping. */ private Map outputMapping; + /** Async-IO stream timeout in milliseconds. */ private String streamTimeout; + /** Connection timeout in milliseconds. */ private String connectTimeout; + /** Whether a failed lookup should fail the job. */ private boolean failOnErrors; + /** Comma-separated stencil URLs for descriptor resolution. */ private String grpcStencilUrl; + /** Optional output protobuf class name. */ private String type; + /** Whether to keep raw response values without type coercion. */ private boolean retainResponseType; + /** gRPC call metadata headers. */ private Map headers; + /** Optional metric id grouping this source's metrics. */ private String metricId; + /** Maximum number of concurrent async requests. */ private int capacity; + /** + * Sets the gRPC service endpoint. 
+ * + * @param endpoint the endpoint host or address + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setEndpoint(String endpoint) { this.endpoint = endpoint; return this; } + /** + * Sets the gRPC service port. + * + * @param servicePort the service port + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setServicePort(int servicePort) { this.servicePort = servicePort; return this; } + /** + * Sets the request message protobuf class name. + * + * @param grpcRequestProtoSchema the request proto schema + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setGrpcRequestProtoSchema(String grpcRequestProtoSchema) { this.grpcRequestProtoSchema = grpcRequestProtoSchema; return this; } + /** + * Sets the response message protobuf class name. + * + * @param grpcResponseProtoSchema the response proto schema + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setGrpcResponseProtoSchema(String grpcResponseProtoSchema) { this.grpcResponseProtoSchema = grpcResponseProtoSchema; return this; } + /** + * Sets the fully-qualified gRPC method name to invoke. + * + * @param grpcMethodUrl the gRPC method URL + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setGrpcMethodUrl(String grpcMethodUrl) { this.grpcMethodUrl = grpcMethodUrl; return this; } + /** + * Sets the request body pattern. + * + * @param requestPattern the pattern populated with the request variables + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setRequestPattern(String requestPattern) { this.requestPattern = requestPattern; return this; } + /** + * Sets the comma-separated input columns substituted into the request pattern. 
+ * + * @param requestVariables the input column names + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setRequestVariables(String requestVariables) { this.requestVariables = requestVariables; return this; } + /** + * Sets the output column to JSON-path mapping. + * + * @param outputMapping the mapping from output column name to its response path + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setOutputMapping(Map outputMapping) { this.outputMapping = outputMapping; return this; } + /** + * Sets the async-IO stream timeout. + * + * @param streamTimeout the stream timeout in milliseconds + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setStreamTimeout(String streamTimeout) { this.streamTimeout = streamTimeout; return this; } + /** + * Sets the connection timeout. + * + * @param connectTimeout the connect timeout in milliseconds + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setConnectTimeout(String connectTimeout) { this.connectTimeout = connectTimeout; return this; } + /** + * Sets whether a failed lookup should fail the job. + * + * @param failOnErrors {@code true} to fail the job on lookup errors + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setFailOnErrors(boolean failOnErrors) { this.failOnErrors = failOnErrors; return this; } + /** + * Sets the comma-separated stencil URLs used to resolve the request and response descriptors. + * + * @param grpcStencilUrl the gRPC stencil URLs + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setGrpcStencilUrl(String grpcStencilUrl) { this.grpcStencilUrl = grpcStencilUrl; return this; } + /** + * Sets the optional output protobuf class name. 
+ * + * @param type the protobuf class name used to type the response + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setType(String type) { this.type = type; return this; } + /** + * Sets whether raw response values are kept without type coercion. + * + * @param retainResponseType {@code true} to retain the raw response type + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setRetainResponseType(boolean retainResponseType) { this.retainResponseType = retainResponseType; return this; } + /** + * Sets the gRPC call metadata headers. + * + * @param headers the headers to attach to each call + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setHeaders(Map headers) { this.headers = headers; return this; } + /** + * Sets the optional metric id grouping this source's metrics. + * + * @param metricId the metric id + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setMetricId(String metricId) { this.metricId = metricId; return this; } + /** + * Sets the maximum number of concurrent async requests. + * + * @param capacity the async capacity + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setCapacity(int capacity) { this.capacity = capacity; return this; } + /** + * Sets the channel keepalive ping interval. + * + * @param grpcArgKeepaliveTimeMs the keepalive time in milliseconds + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setGrpcArgKeepaliveTimeMs(String grpcArgKeepaliveTimeMs) { this.grpcArgKeepaliveTimeMs = grpcArgKeepaliveTimeMs; return this; } + /** + * Sets the channel keepalive ping timeout. 
+ * + * @param grpcArgKeepaliveTimeoutMs the keepalive timeout in milliseconds + * @return this builder for chaining + */ public GrpcSourceConfigBuilder setGrpcArgKeepaliveTimeoutMs(String grpcArgKeepaliveTimeoutMs) { this.grpcArgKeepaliveTimeoutMs = grpcArgKeepaliveTimeoutMs; return this; } + /** + * Builds a {@link GrpcSourceConfig} from the values accumulated in this builder. + * + * @return the assembled gRPC source config + */ public GrpcSourceConfig createGrpcSourceConfig() { return new GrpcSourceConfig(endpoint, servicePort, grpcRequestProtoSchema, grpcResponseProtoSchema, grpcMethodUrl, grpcArgKeepaliveTimeMs, grpcArgKeepaliveTimeoutMs, requestPattern, requestVariables, streamTimeout, connectTimeout, failOnErrors, grpcStencilUrl, type, retainResponseType, headers, outputMapping, metricId, capacity); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcStreamDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcStreamDecorator.java index 4fd636c34..e094b6af7 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcStreamDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/GrpcStreamDecorator.java @@ -14,8 +14,17 @@ */ public class GrpcStreamDecorator implements StreamDecorator { + /** + * The gRPC source configuration passed to the connector created by this decorator. + */ private GrpcSourceConfig grpcSourceConfig; + /** + * The metric configuration applied to the connector, including telemetry and the metric id. + */ private final ExternalMetricConfig externalMetricConfig; + /** + * The schema configuration providing column and proto metadata to the connector. 
+ */ private final SchemaConfig schemaConfig; @@ -33,11 +42,25 @@ public GrpcStreamDecorator(GrpcSourceConfig grpcSourceConfig, ExternalMetricConf } + /** + * {@inheritDoc} + * + * @return {@code true} when a gRPC source configuration is present + */ @Override public Boolean canDecorate() { return grpcSourceConfig != null; } + /** + * {@inheritDoc} + * + *

Wraps the input stream in an ordered asynchronous gRPC lookup, registering the telemetry subscriber + * and applying the configured stream timeout and capacity. + * + * @param inputStream the stream to enrich with gRPC lookups + * @return the asynchronously enriched stream + */ @Override public DataStream decorate(DataStream inputStream) { GrpcAsyncConnector grpcAsyncConnector = new GrpcAsyncConnector(grpcSourceConfig, externalMetricConfig, schemaConfig); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/DynamicMessageMarshaller.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/DynamicMessageMarshaller.java index 1147cf0c9..b214b6515 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/DynamicMessageMarshaller.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/DynamicMessageMarshaller.java @@ -13,6 +13,9 @@ * A {@link Marshaller} for dynamic messages. */ public class DynamicMessageMarshaller implements Marshaller { + /** + * The protobuf descriptor describing the dynamic message type marshalled by this instance. + */ private final Descriptor messageDescriptor; /** @@ -24,6 +27,15 @@ public DynamicMessageMarshaller(Descriptor messageDescriptor) { this.messageDescriptor = messageDescriptor; } + /** + * {@inheritDoc} + * + *

Reads a {@link DynamicMessage} of the configured type from the supplied stream. + * + * @param inputStream the stream carrying the serialized protobuf message + * @return the parsed dynamic message + * @throws RuntimeException if the message cannot be read from the stream + */ @Override public DynamicMessage parse(InputStream inputStream) { try { @@ -35,6 +47,14 @@ public DynamicMessage parse(InputStream inputStream) { } } + /** + * {@inheritDoc} + * + *

Serializes the given message into a stream over its protobuf byte representation. + * + * @param abstractMessage the dynamic message to serialize + * @return a stream over the serialized message bytes + */ @Override public InputStream stream(DynamicMessage abstractMessage) { return abstractMessage.toByteString().newInput(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/GrpcClient.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/GrpcClient.java index 8b6b70764..910ed5c33 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/GrpcClient.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/GrpcClient.java @@ -24,12 +24,24 @@ * The Grpc client. */ public class GrpcClient { + /** + * The gRPC source configuration describing the endpoint, headers, and keepalive settings. + */ private final GrpcSourceConfig grpcConfig; + /** + * The managed channel to the gRPC service, built by {@link #addChannel()} and decorated with headers and keepalive options. + */ private ManagedChannel decoratedChannel; + /** + * The default keepalive timeout in milliseconds, used when none is configured. + */ private final long defaultKeepAliveTimeout = 20000L; + /** + * The default keepalive ping interval in milliseconds, used when none is configured. + */ private final long defaultKeepAliveInterval = Long.MAX_VALUE; /** @@ -50,6 +62,12 @@ public void addChannel() { decoratedChannel = channelBuilder.build(); } + /** + * Applies keepalive settings and optional metadata headers to the given channel builder. 
+ * + * @param channelBuilder the channel builder to decorate + * @return the decorated channel builder, configured with keepalive options and any header interceptors + */ protected ManagedChannelBuilder decorateManagedChannelBuilder(ManagedChannelBuilder channelBuilder) { long keepAliveInterval = StringUtils.isNotEmpty(grpcConfig.getGrpcArgKeepaliveTimeMs()) ? Long.parseLong(grpcConfig.getGrpcArgKeepaliveTimeMs()) : defaultKeepAliveInterval; @@ -91,6 +109,14 @@ public void asyncUnaryCall( responseObserver); } + /** + * Creates a unary client call bound to the configured method using dynamic-message marshallers. + * + * @param callOptions the call options to apply to the call + * @param inputDescriptor the descriptor used to marshal the request message + * @param outputDescriptor the descriptor used to unmarshal the response message + * @return a client call for the configured gRPC method + */ private ClientCall createCall(CallOptions callOptions, Descriptor inputDescriptor, Descriptor outputDescriptor) { return decoratedChannel.newCall(MethodDescriptor.newBuilder(new DynamicMessageMarshaller(inputDescriptor), new DynamicMessageMarshaller(outputDescriptor)) diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/GrpcRequestHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/GrpcRequestHandler.java index 1e09e82ee..f40877877 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/GrpcRequestHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/grpc/client/GrpcRequestHandler.java @@ -11,7 +11,13 @@ */ public class GrpcRequestHandler { + /** + * The gRPC source configuration providing the request pattern and request proto schema. + */ private GrpcSourceConfig grpcSourceConfig; + /** + * Resolver used to look up the protobuf descriptor for the request message. 
+ */ private DescriptorManager descriptorManager; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpAsyncConnector.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpAsyncConnector.java index f7b1592cd..67a6b0b8f 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpAsyncConnector.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpAsyncConnector.java @@ -37,9 +37,21 @@ */ public class HttpAsyncConnector extends AsyncConnector { + /** + * Logger used to record connector lifecycle events such as connection closure. + */ private static final Logger LOGGER = LoggerFactory.getLogger(HttpAsyncConnector.class.getName()); + /** + * The asynchronous HTTP client used to execute the outbound enrichment requests. + */ private AsyncHttpClient httpClient; + /** + * Configuration describing the endpoint, verb, request/header patterns and output mapping for this connector. + */ private HttpSourceConfig httpSourceConfig; + /** + * The set of HTTP status codes excluded from triggering a fatal failure even when fail-on-errors is enabled. + */ private Set failOnErrorsExclusionSet; /** @@ -83,6 +95,12 @@ AsyncHttpClient getHttpClient() { return httpClient; } + /** + * {@inheritDoc} + * + *

Lazily creates the underlying {@link AsyncHttpClient} when one has not already been injected, + * applying the connect timeout taken from the {@code HttpSourceConfig}. + */ @Override protected void createClient() { if (httpClient == null) { @@ -90,12 +108,29 @@ protected void createClient() { } } + /** + * {@inheritDoc} + * + *

Opens the connector by delegating to the superclass and then computing the set of status codes + * that are excluded from fail-on-errors handling from the configured code ranges. + * + * @param configuration the Flink {@code Configuration} supplied when the async operator is opened + * @throws Exception if the superclass fails to initialise the connector + */ @Override public void open(Configuration configuration) throws Exception { super.open(configuration); setFailOnErrorsExclusionSet(httpSourceConfig.getExcludeFailOnErrorsCodeRange()); } + /** + * {@inheritDoc} + * + *

Closes the underlying HTTP client, clears the reference, records a close-connection metric and + * logs that the connector has been shut down. + * + * @throws Exception if closing the underlying HTTP client fails + */ @Override public void close() throws Exception { httpClient.close(); @@ -104,6 +139,18 @@ public void close() throws Exception { LOGGER.error("HTTP Connector : Connection closed"); } + /** + * {@inheritDoc} + * + *

Resolves the request, dynamic header and endpoint variable values from the incoming row, validates + * them and, when valid, builds the request via {@link HttpRequestFactory} and executes it asynchronously. + * A {@link HttpResponseHandler} completes the {@code resultFuture} once a response or error is received. + * An unsupported HTTP verb is recorded as an invalid-configuration metric and completes the future + * exceptionally. + * + * @param input the input {@link Row} carrying the values used to populate the request + * @param resultFuture the future completed with the enriched output row or an error + */ @Override protected void process(Row input, ResultFuture resultFuture) { try { @@ -130,10 +177,25 @@ protected void process(Row input, ResultFuture resultFuture) { } + /** + * Returns the set of HTTP status codes excluded from fail-on-errors handling. + * + * @return the {@code Set} of status codes for which the connector will not fail even when + * fail-on-errors is enabled + */ protected Set getFailOnErrorsExclusionSet() { return failOnErrorsExclusionSet; } + /** + * Parses the configured comma-separated, hyphen-delimited status code ranges into the exclusion set. + * + *

For example {@code "500-502,504"} expands to the codes {@code 500}, {@code 501}, {@code 502} and + * {@code 504}. A {@code null} or empty value leaves the exclusion set empty. + * + * @param excludeFailOnErrorsCodeRange the raw configuration string describing status code ranges, + * which may be {@code null} or empty + */ private void setFailOnErrorsExclusionSet(String excludeFailOnErrorsCodeRange) { failOnErrorsExclusionSet = new HashSet(); if (!StringUtil.isNullOrEmpty(excludeFailOnErrorsCodeRange)) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpResponseHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpResponseHandler.java index 64a3abd91..6571f6abc 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpResponseHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpResponseHandler.java @@ -32,18 +32,54 @@ * The Http response handler. */ public class HttpResponseHandler extends AsyncCompletionHandler { + /** + * Logger used to record response failures and path resolution errors. + */ private static final Logger LOGGER = LoggerFactory.getLogger(HttpResponseHandler.class.getName()); + /** + * Regular expression matching any HTTP 2xx status code, used to classify a response as successful. + */ protected static final String SUCCESS_CODE_PATTERN = "^2.*"; + /** + * Manages the input and output {@link Row} so resolved values can be written back to the output row. + */ private final RowManager rowManager; + /** + * Resolves output column names to their positional indexes within the output row. + */ private ColumnNameManager columnNameManager; + /** + * Protobuf descriptor describing the output message type, used for type-aware field conversion. + */ private Descriptors.Descriptor descriptor; + /** + * The future completed with the enriched output row once the response has been processed. 
+ */ private ResultFuture resultFuture; + /** + * Configuration describing the output mapping, type handling and error behaviour for the call. + */ private HttpSourceConfig httpSourceConfig; + /** + * The set of status codes excluded from triggering a fatal failure when fail-on-errors is enabled. + */ private Set failOnErrorsExclusionSet; + /** + * Manager used to emit success, failure and error metrics for the external call. + */ private MeterStatsManager meterStatsManager; + /** + * The instant at which the request was dispatched, used to compute response latency telemetry. + */ private Instant startTime; + /** + * Reporter used to surface fatal and non-fatal exceptions raised while handling the response. + */ private ErrorReporter errorReporter; + /** + * Helper that emits post-response telemetry such as success, failure and latency events. + */ private PostResponseTelemetry postResponseTelemetry; @@ -82,6 +118,15 @@ public void startTimer() { startTime = Instant.now(); } + /** + * {@inheritDoc} + * + *

Invoked when the asynchronous HTTP call completes. Responses with a 2xx status code are passed to + * the success handler; all other codes are recorded as telemetry and routed to the failure handler. + * + * @param response the HTTP {@code Response} returned by the external service + * @return the original {@code Response} object + */ @Override public Object onCompleted(Response response) { int statusCode = response.getStatusCode(); @@ -95,6 +140,14 @@ public Object onCompleted(Response response) { return response; } + /** + * {@inheritDoc} + * + *

Invoked when the asynchronous HTTP call fails with an exception. Records an other-errors metric and + * delegates to the failure handler with a status code of {@code 0}. + * + * @param t the throwable raised while performing the request + */ @Override public void onThrowable(Throwable t) { t.printStackTrace(); @@ -102,6 +155,15 @@ public void onThrowable(Throwable t) { failureHandler(t.getMessage(), 0); } + /** + * Extracts the configured fields from a successful response and completes the result future. + * + *

For each configured output mapping, the value is read from the response body using its JsonPath and + * written into the output row at the resolved column index; on success, success telemetry is sent. + * A missing path records a failure metric and reports the error. + * + * @param response the successful HTTP {@code Response} whose body is parsed for output values + */ private void successHandler(Response response) { Map outputMappings = httpSourceConfig.getOutputMapping(); ArrayList outputMappingKeys = new ArrayList<>(outputMappings.keySet()); @@ -142,6 +204,13 @@ public void failureHandler(String logMessage, Integer statusCode) { resultFuture.complete(Collections.singleton(rowManager.getAll())); } + /** + * Determines whether the given status code should be treated as a fatal failure. + * + * @param statusCode the HTTP status code of the response, or {@code 0} when the call threw an exception + * @return {@code true} when fail-on-errors is enabled and the code is either {@code 0} or not present in + * the exclusion set; {@code false} otherwise + */ private boolean shouldFailOnError(Integer statusCode) { if (httpSourceConfig.isFailOnErrors() && (statusCode == 0 || !failOnErrorsExclusionSet.contains(statusCode))) { return true; @@ -149,6 +218,16 @@ private boolean shouldFailOnError(Integer statusCode) { return false; } + /** + * Writes a resolved response value into the output row at the given index. + * + *

When the response type is not retained or a type is configured the value is converted using the + * matching type handler; otherwise the raw value is written directly. + * + * @param key the output field name the value maps to + * @param value the value read from the response body + * @param fieldIndex the index within the output row at which to store the value + */ private void setField(String key, Object value, int fieldIndex) { if (!httpSourceConfig.isRetainResponseType() || httpSourceConfig.hasType()) { setFieldUsingType(key, value, fieldIndex); @@ -157,6 +236,16 @@ private void setField(String key, Object value, int fieldIndex) { } } + /** + * Converts a response value using the protobuf field's type handler before writing it to the output row. + * + *

Resolves the field descriptor for {@code key} from the output descriptor and throws if it is not + * found, then applies the corresponding type handler transformation. + * + * @param key the output field name used to look up the field descriptor + * @param value the value read from the response body + * @param fieldIndex the index within the output row at which to store the converted value + */ private void setFieldUsingType(String key, Object value, Integer fieldIndex) { Descriptors.FieldDescriptor fieldDescriptor = null; try { @@ -172,6 +261,11 @@ private void setFieldUsingType(String key, Object value, Integer fieldIndex) { } + /** + * Reports the given exception as fatal and completes the result future exceptionally. + * + * @param e the exception to report and propagate to the result future + */ private void reportAndThrowError(Exception e) { errorReporter.reportFatalException(e); resultFuture.completeExceptionally(e); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpSourceConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpSourceConfig.java index a775452fe..f02fed910 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpSourceConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpSourceConfig.java @@ -16,25 +16,76 @@ * A class that holds Http configuration. */ public class HttpSourceConfig implements Serializable, SourceConfig { + /** + * The base HTTP endpoint URL, optionally containing format placeholders for endpoint variables. + */ private String endpoint; + /** + * Comma-separated input columns whose values are substituted into the endpoint placeholders. + */ private String endpointVariables; + /** + * The HTTP verb (for example {@code GET}, {@code POST} or {@code PUT}) used for the request. 
+ */ private String verb; + /** + * Format pattern used to build the request path or body from the request variables. + */ private String requestPattern; + /** + * Comma-separated input columns whose values populate the request pattern. + */ private String requestVariables; + /** + * Format pattern used to build the dynamic request headers. + */ private String headerPattern; + /** + * Comma-separated input columns whose values populate the header pattern. + */ private String headerVariables; + /** + * Maximum time, in milliseconds, to wait for the asynchronous response before timing out. + */ private String streamTimeout; + /** + * Maximum time, in milliseconds, to wait while establishing the connection. + */ private String connectTimeout; + /** + * Whether a failed call should raise a fatal error instead of being reported as non-fatal. + */ private boolean failOnErrors; + /** + * Comma-separated, hyphen-delimited status code ranges excluded from fail-on-errors handling. + */ private String excludeFailOnErrorsCodeRange; + /** + * Fully qualified protobuf message type used for type-aware conversion of the response. + */ @SerializedName(value = "type", alternate = {"Type", "TYPE"}) private String type; + /** + * Maximum number of concurrent asynchronous requests allowed for this connector. + */ private String capacity; + /** + * Static headers added to every request, keyed by header name. + */ @SerializedName(value = "headers", alternate = {"Headers", "HEADERS"}) private Map headers; + /** + * Mapping of output column names to the configuration describing how to extract their value from the response. + */ private Map outputMapping; + /** + * Identifier used to tag the metrics emitted for this external source. + */ @SerializedName(value = "metricId", alternate = {"MetricId", "METRICID"}) private String metricId; + /** + * Whether the raw response value should be retained as-is instead of being converted to the configured type. 
+ */ private boolean retainResponseType; /** @@ -142,11 +193,25 @@ public String getHeaderVariables() { return headerVariables; } + /** + * {@inheritDoc} + * + *

For an HTTP source the pattern is the configured request pattern. + * + * @return the request format pattern + */ @Override public String getPattern() { return requestPattern; } + /** + * {@inheritDoc} + * + *

For an HTTP source the variables are the configured request variables. + * + * @return the comma-separated request variable columns + */ @Override public String getVariables() { return requestVariables; @@ -161,6 +226,11 @@ public Integer getStreamTimeout() { return Integer.valueOf(streamTimeout); } + /** + * Returns whether the connector should fail fatally when the call errors. + * + * @return {@code true} if failures should be treated as fatal, {@code false} otherwise + */ public boolean isFailOnErrors() { return failOnErrors; } @@ -175,11 +245,21 @@ public String getExcludeFailOnErrorsCodeRange() { } + /** + * {@inheritDoc} + * + * @return the metric identifier configured for this source + */ @Override public String getMetricId() { return metricId; } + /** + * Gets the configured protobuf message type. + * + * @return the fully qualified type name, or {@code null} when no type is configured + */ public String getType() { return type; } @@ -211,11 +291,26 @@ public Map getOutputMapping() { return outputMapping; } + /** + * {@inheritDoc} + * + *

The output columns are derived from the keys of the configured output mapping. + * + * @return the {@code List} of output column names + */ @Override public List getOutputColumns() { return new ArrayList<>(outputMapping.keySet()); } + /** + * Builds the map of fields that must be present for the configuration to be valid. + * + *

The returned map is used by validation to ensure required settings such as the endpoint, verb, + * patterns, timeouts and output mapping have been supplied. + * + * @return a {@code HashMap} of mandatory field names to their configured values + */ public HashMap getMandatoryFields() { HashMap mandatoryFields = new HashMap<>(); mandatoryFields.put("endpoint", endpoint); @@ -249,6 +344,14 @@ public boolean isRetainResponseType() { return retainResponseType; } + /** + * {@inheritDoc} + * + *

Equality covers the fields compared below; {@code endpointVariables} is excluded and {@code excludeFailOnErrorsCodeRange} is compared with {@code ==}. + * + * @param o the object to compare with this configuration + * @return {@code true} if the given object is an equivalent {@code HttpSourceConfig}, {@code false} otherwise + */ @Override public boolean equals(Object o) { if (this == o) { @@ -261,6 +364,11 @@ public boolean equals(Object o) { return failOnErrors == that.failOnErrors && excludeFailOnErrorsCodeRange == that.excludeFailOnErrorsCodeRange && retainResponseType == that.retainResponseType && Objects.equals(endpoint, that.endpoint) && Objects.equals(verb, that.verb) && Objects.equals(requestPattern, that.requestPattern) && Objects.equals(requestVariables, that.requestVariables) && Objects.equals(headerPattern, that.headerPattern) && Objects.equals(headerVariables, that.headerVariables) && Objects.equals(streamTimeout, that.streamTimeout) && Objects.equals(connectTimeout, that.connectTimeout) && Objects.equals(type, that.type) && Objects.equals(capacity, that.capacity) && Objects.equals(headers, that.headers) && Objects.equals(outputMapping, that.outputMapping) && Objects.equals(metricId, that.metricId); } + /** + * {@inheritDoc} + * + * @return a hash code derived from all configuration fields + */ @Override public int hashCode() { return Objects.hash(endpoint, endpointVariables, verb, requestPattern, requestVariables, headerPattern, headerVariables, streamTimeout, connectTimeout, failOnErrors, excludeFailOnErrorsCodeRange, type, capacity, headers, outputMapping, metricId, retainResponseType); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpStreamDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpStreamDecorator.java index 78d8c6820..ac90630d7 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpStreamDecorator.java +++
b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/HttpStreamDecorator.java @@ -15,8 +15,17 @@ */ public class HttpStreamDecorator implements StreamDecorator { + /** + * Configuration describing the HTTP endpoint, verb and output mapping to enrich the stream with. + */ private final HttpSourceConfig httpSourceConfig; + /** + * Metric configuration carrying the telemetry settings shared with the created connector. + */ private final ExternalMetricConfig externalMetricConfig; + /** + * Schema configuration providing the descriptors and column metadata for the stream. + */ private final SchemaConfig schemaConfig; /** @@ -32,11 +41,28 @@ public HttpStreamDecorator(HttpSourceConfig httpSourceConfig, ExternalMetricConf this.schemaConfig = schemaConfig; } + /** + * {@inheritDoc} + * + *

An HTTP decorator can decorate the stream only when an {@code HttpSourceConfig} has been configured. + * + * @return {@code true} when the HTTP source config is present, {@code false} otherwise + */ @Override public Boolean canDecorate() { return httpSourceConfig != null; } + /** + * {@inheritDoc} + * + *

Wraps the input stream in an order-preserving asynchronous operator backed by a + * {@link HttpAsyncConnector}, using the configured stream timeout and capacity. The connector is + * subscribed to the telemetry subscriber before the operator is created. + * + * @param inputStream the stream of {@link Row} records to enrich with HTTP responses + * @return the decorated stream emitting enriched {@link Row} records + */ @Override public DataStream decorate(DataStream inputStream) { HttpAsyncConnector httpAsyncConnector = new HttpAsyncConnector(httpSourceConfig, externalMetricConfig, schemaConfig); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpGetRequestHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpGetRequestHandler.java index 4e5cb064d..43b4a0976 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpGetRequestHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpGetRequestHandler.java @@ -15,10 +15,25 @@ * The Http get request handler. */ public class HttpGetRequestHandler implements HttpRequestHandler { + /** + * Configuration describing the endpoint, verb, patterns and headers for the request. + */ private HttpSourceConfig httpSourceConfig; + /** + * The asynchronous HTTP client used to prepare the request builder. + */ private AsyncHttpClient httpClient; + /** + * Resolved values substituted into the request pattern that forms the request path. + */ private Object[] requestVariablesValues; + /** + * Resolved values substituted into the dynamic header pattern. + */ private Object[] dynamicHeaderVariablesValues; + /** + * Resolved values substituted into the endpoint placeholders. 
+ */ private Object[] endpointVariablesValues; /** @@ -37,6 +52,16 @@ public HttpGetRequestHandler(HttpSourceConfig httpSourceConfig, AsyncHttpClient this.endpointVariablesValues = endpointVariablesValues; } + /** + * {@inheritDoc} + * + *

Builds a GET request by formatting the endpoint and request path from their variable values and + * applying the static headers together with any dynamic headers produced from the header pattern. + * + * @return the prepared {@code BoundRequestBuilder} for the GET request + * @throws InvalidConfigurationException if the header pattern is invalid or incompatible with the + * configured header variables + */ @Override public BoundRequestBuilder create() { String endpointPath = String.format(httpSourceConfig.getPattern(), requestVariablesValues); @@ -58,6 +83,13 @@ public BoundRequestBuilder create() { return addHeaders(getRequest, headers); } + /** + * {@inheritDoc} + * + *

This handler can create the request when the configured verb is {@code GET}. + * + * @return {@code true} when the configured verb is {@code GET} (case-insensitive), {@code false} otherwise + */ @Override public boolean canCreate() { return httpSourceConfig.getVerb().equalsIgnoreCase("get"); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpPostRequestHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpPostRequestHandler.java index b675b786e..e10c71b77 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpPostRequestHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpPostRequestHandler.java @@ -15,10 +15,25 @@ * The Http post request handler. */ public class HttpPostRequestHandler implements HttpRequestHandler { + /** + * Configuration describing the endpoint, verb, patterns and headers for the request. + */ private HttpSourceConfig httpSourceConfig; + /** + * The asynchronous HTTP client used to prepare the request builder. + */ private AsyncHttpClient httpClient; + /** + * Resolved values substituted into the request pattern that forms the request body. + */ private Object[] requestVariablesValues; + /** + * Resolved values substituted into the dynamic header pattern. + */ private Object[] dynamicHeaderVariablesValues; + /** + * Resolved values substituted into the endpoint placeholders. + */ private Object[] endpointVariablesValues; /** * Instantiates a new Http post request handler. @@ -36,6 +51,16 @@ public HttpPostRequestHandler(HttpSourceConfig httpSourceConfig, AsyncHttpClient this.endpointVariablesValues = endpointVariablesValues; } + /** + * {@inheritDoc} + * + *

Builds a POST request by formatting the request body and endpoint from their variable values and + * applying the static headers together with any dynamic headers produced from the header pattern. + * + * @return the prepared {@code BoundRequestBuilder} for the POST request + * @throws InvalidConfigurationException if the header pattern is invalid or incompatible with the + * configured header variables + */ @Override public BoundRequestBuilder create() { String requestBody = String.format(httpSourceConfig.getPattern(), requestVariablesValues); @@ -58,6 +83,13 @@ public BoundRequestBuilder create() { return addHeaders(postRequest, headers); } + /** + * {@inheritDoc} + * + *

This handler can create the request when the configured verb is {@code POST}. + * + * @return {@code true} when the configured verb is {@code POST} (case-insensitive), {@code false} otherwise + */ @Override public boolean canCreate() { return httpSourceConfig.getVerb().equalsIgnoreCase("post"); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpPutRequestHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpPutRequestHandler.java index 16988dff4..75378236b 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpPutRequestHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/http/request/HttpPutRequestHandler.java @@ -15,10 +15,25 @@ * The Http post request handler. */ public class HttpPutRequestHandler implements HttpRequestHandler { + /** + * Configuration describing the endpoint, verb, patterns and headers for the request. + */ private HttpSourceConfig httpSourceConfig; + /** + * The asynchronous HTTP client used to prepare the request builder. + */ private AsyncHttpClient httpClient; + /** + * Resolved values substituted into the request pattern that forms the request body. + */ private Object[] requestVariablesValues; + /** + * Resolved values substituted into the dynamic header pattern. + */ private Object[] dynamicHeaderVariablesValues; + /** + * Resolved values substituted into the endpoint placeholders. + */ private Object[] endpointVariablesValues; /** * Instantiates a new Http post request handler. @@ -36,6 +51,16 @@ public HttpPutRequestHandler(HttpSourceConfig httpSourceConfig, AsyncHttpClient this.endpointVariablesValues = endpointVariablesValues; } + /** + * {@inheritDoc} + * + *

Builds a PUT request by formatting the request body and endpoint from their variable values and + * applying the static headers together with any dynamic headers produced from the header pattern. + * + * @return the prepared {@code BoundRequestBuilder} for the PUT request + * @throws InvalidConfigurationException if the header pattern is invalid or incompatible with the + * configured header variables + */ @Override public BoundRequestBuilder create() { String requestBody = String.format(httpSourceConfig.getPattern(), requestVariablesValues); @@ -58,6 +83,13 @@ public BoundRequestBuilder create() { return addHeaders(putRequest, headers); } + /** + * {@inheritDoc} + * + *

This handler can create the request when the configured verb is {@code PUT}. + * + * @return {@code true} when the configured verb is {@code PUT} (case-insensitive), {@code false} otherwise + */ @Override public boolean canCreate() { return httpSourceConfig.getVerb().equalsIgnoreCase("put"); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgAsyncConnector.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgAsyncConnector.java index 050656f56..1a020e233 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgAsyncConnector.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgAsyncConnector.java @@ -29,8 +29,17 @@ * The Postgre async connector. */ public class PgAsyncConnector extends AsyncConnector { + /** + * Logger used to record connection pool lifecycle events. + */ private static final Logger LOGGER = LoggerFactory.getLogger(PgAsyncConnector.class.getName()); + /** + * Configuration describing the Postgres connection, pool sizing, query pattern and output mapping. + */ private final PgSourceConfig pgSourceConfig; + /** + * The Vert.x Postgres connection pool used to execute lookup queries. + */ private PgPool pgClient; /** @@ -63,6 +72,12 @@ public PgAsyncConnector(PgSourceConfig pgSourceConfig, ExternalMetricConfig exte this.pgSourceConfig = pgSourceConfig; } + /** + * {@inheritDoc} + * + *

Lazily builds the Vert.x {@code PgPool} from the configured host, port, database, credentials and + * timeouts when a client has not already been injected. + */ @Override protected void createClient() { if (pgClient == null) { @@ -82,6 +97,17 @@ protected void createClient() { } } + /** + * {@inheritDoc} + * + *

Resolves the query variable values from the incoming row, validates them and, when valid, formats + * the configured query pattern and executes it asynchronously. A {@link PgResponseHandler} completes the + * {@code resultFuture} with the enriched row. A query that cannot be created is recorded as an + * invalid-configuration metric and completes the future exceptionally. + * + * @param input the input {@link Row} carrying the values used to build the query + * @param resultFuture the future completed with the enriched output row or an error + */ @Override public void process(Row input, ResultFuture resultFuture) { RowManager rowManager = new RowManager(input); @@ -107,6 +133,12 @@ public void process(Row input, ResultFuture resultFuture) { } } + /** + * {@inheritDoc} + * + *

Closes the Postgres connection pool, clears the reference, records a close-connection metric and + * logs that the pool has been released. + */ @Override public void close() { pgClient.close(); @@ -115,6 +147,17 @@ public void close() { LOGGER.info("DB Connector : Connection pool released"); } + /** + * Creates a new Vert.x {@code PgPool} backed by a dedicated {@code Vertx} instance. + * + *

Fails when invoked from within an existing Vert.x context, and enables native transport when the + * connection uses a domain socket. + * + * @param connectOptions the Postgres connection options describing host, port, database and credentials + * @param poolOptions the pool options describing the maximum pool size + * @return a newly created {@code PgPool} bound to a fresh Vert.x context + * @throws IllegalStateException if called from within an existing Vert.x context + */ private PgPool pool(PgConnectOptions connectOptions, PoolOptions poolOptions) { if (Vertx.currentContext() != null) { throw new IllegalStateException("Running in a Vertx context => use PgPool#pool(Vertx, PgConnectOptions, PoolOptions) instead"); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgResponseHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgResponseHandler.java index 7c319fad6..cc959e810 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgResponseHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgResponseHandler.java @@ -30,15 +30,45 @@ * The Postgre response handler. */ public class PgResponseHandler implements Handler>> { + /** + * Logger used to record query failures encountered while handling the response. + */ private static final Logger LOGGER = LoggerFactory.getLogger(PgResponseHandler.class.getName()); + /** + * Configuration describing the output mapping, type handling and error behaviour for the query. + */ private final PgSourceConfig pgSourceConfig; + /** + * Manager used to emit success, failure and error metrics for the query. + */ private final MeterStatsManager meterStatsManager; + /** + * Manages the input and output {@link Row} so resolved values can be written back to the output row. 
+ */ private final RowManager rowManager; + /** + * Resolves output column names to their positional indexes within the output row. + */ private final ColumnNameManager columnNameManager; + /** + * Protobuf descriptor describing the output message type, used for type-aware field conversion. + */ private final Descriptors.Descriptor outputDescriptor; + /** + * The future completed with the enriched output row once the query result has been processed. + */ private final ResultFuture resultFuture; + /** + * Reporter used to surface fatal and non-fatal exceptions raised while handling the result. + */ private final ErrorReporter errorReporter; + /** + * Helper that emits post-response telemetry such as success, failure and latency events. + */ private PostResponseTelemetry postResponseTelemetry; + /** + * The instant at which the query was dispatched, used to compute response latency telemetry. + */ private Instant startTime; /** @@ -72,6 +102,14 @@ public void startTimer() { startTime = Instant.now(); } + /** + * {@inheritDoc} + * + *

Invoked when the asynchronous query completes. A successful result is passed to the success handler; + * a failed result is routed to the failure handler. + * + * @param event the asynchronous result wrapping the query's {@code RowSet} or the failure cause + */ @Override public void handle(AsyncResult> event) { if (event.succeeded()) { @@ -81,6 +119,15 @@ public void handle(AsyncResult> event) { } } + /** + * Maps a successful query result onto the output row and completes the result future. + * + *

A result set containing more than one row is treated as a configuration error. For each configured + * output column the value is read from the mapped query parameter and written to the output row; a + * missing column is reported as an error. Success telemetry is sent once all columns have been mapped. + * + * @param resultRowSet the result set returned by the query, expected to contain at most one row + */ private void successHandler(RowSet resultRowSet) { if (resultRowSet.size() > 1) { meterStatsManager.markEvent(ExternalSourceAspects.INVALID_CONFIGURATION); @@ -112,6 +159,15 @@ private void successHandler(RowSet resultRowSet) { resultFuture.complete(Collections.singleton(rowManager.getAll())); } + /** + * Handles a failed query by emitting telemetry and reporting the error. + * + *

Sends failure telemetry and either reports a fatal error or a non-fatal exception depending on the + * fail-on-errors setting. A {@code ResponseException} contributes its status code to telemetry, while + * other errors are recorded as generic errors. The result future is completed with the unmodified row. + * + * @param e the throwable describing why the query failed + */ private void failureHandler(Throwable e) { postResponseTelemetry.sendFailureTelemetry(meterStatsManager, startTime); LOGGER.error(e.getMessage()); @@ -130,6 +186,18 @@ private void failureHandler(Throwable e) { resultFuture.complete(Collections.singleton(rowManager.getAll())); } + /** + * Writes a single query result value into the output row at the given index. + * + *

When the response type is not retained or a type is configured, map values are converted into a + * nested {@link Row} using the field descriptor while scalar values are converted with the matching type + * handler; otherwise the raw value is written directly. A missing field descriptor is reported as an + * error. + * + * @param index the index within the output row at which to store the value + * @param value the value read from the query result + * @param name the output field name used to look up the field descriptor + */ private void setField(int index, Object value, String name) { if (!pgSourceConfig.isRetainResponseType() || pgSourceConfig.hasType()) { Descriptors.FieldDescriptor fieldDescriptor = outputDescriptor.findFieldByName(name); @@ -150,6 +218,11 @@ private void setField(int index, Object value, String name) { } } + /** + * Reports the given exception as fatal and completes the result future exceptionally. + * + * @param exception the exception to report and propagate to the result future + */ private void reportAndThrowError(Exception exception) { errorReporter.reportFatalException(exception); resultFuture.completeExceptionally(exception); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgSourceConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgSourceConfig.java index 8b8a665bd..caa6c2a82 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgSourceConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgSourceConfig.java @@ -15,22 +15,70 @@ */ public class PgSourceConfig implements Serializable, SourceConfig { + /** + * The Postgres server host name. + */ private final String host; + /** + * The Postgres server port. + */ private final String port; + /** + * The user name used to authenticate with the database. 
+ */ private final String user; + /** + * The password used to authenticate with the database. + */ private final String password; + /** + * The name of the database to connect to. + */ private final String database; + /** + * Fully qualified protobuf message type used for type-aware conversion of the result. + */ private final String type; + /** + * Maximum number of concurrent connections and asynchronous requests allowed. + */ private final String capacity; + /** + * Maximum time, in milliseconds, to wait for the asynchronous response before timing out. + */ private final String streamTimeout; + /** + * Mapping of output column names to the query column they are populated from. + */ private final Map outputMapping; + /** + * Maximum time, in milliseconds, to wait while establishing the connection. + */ private final String connectTimeout; + /** + * Maximum time a pooled connection may remain idle before being closed. + */ private final String idleTimeout; + /** + * Comma-separated input columns whose values populate the query pattern. + */ private final String queryVariables; + /** + * Format pattern used to build the SQL query from the query variables. + */ private final String queryPattern; + /** + * Whether a failed query should raise a fatal error instead of being reported as non-fatal. + */ private boolean failOnErrors; + /** + * Identifier used to tag the metrics emitted for this external source. + */ @SerializedName(value = "metricId", alternate = {"MetricId", "METRICID"}) private String metricId; + /** + * Whether the raw result value should be retained as-is instead of being converted to the configured type. + */ private boolean retainResponseType; /** @@ -73,11 +121,26 @@ public PgSourceConfig(String host, String port, String user, String password, St this.retainResponseType = retainResponseType; } + /** + * {@inheritDoc} + * + *

The output columns are derived from the keys of the configured output mapping. + * + * @return the {@code List} of output column names + */ @Override public List getOutputColumns() { return new ArrayList<>(outputMapping.keySet()); } + /** + * {@inheritDoc} + * + *

Collects the settings that must be present for the configuration to be valid, such as the host, + * port, credentials, database, capacity, timeouts, query pattern and output mapping. + * + * @return a {@code HashMap} of mandatory field names to their configured values + */ @Override public HashMap getMandatoryFields() { HashMap mandatoryFields = new HashMap<>(); @@ -187,11 +250,25 @@ public String getQueryVariables() { return queryVariables; } + /** + * {@inheritDoc} + * + *

For a Postgres source the pattern is the configured query pattern. + * + * @return the SQL query format pattern + */ @Override public String getPattern() { return queryPattern; } + /** + * {@inheritDoc} + * + *

For a Postgres source the variables are the configured query variables. + * + * @return the comma-separated query variable columns + */ @Override public String getVariables() { return queryVariables; @@ -206,6 +283,11 @@ public boolean hasType() { return StringUtils.isNotEmpty(type); } + /** + * Gets the configured protobuf message type. + * + * @return the fully qualified type name, or {@code null} when no type is configured + */ public String getType() { return type; } @@ -220,10 +302,20 @@ public String getMappedQueryParam(String outputColumn) { return outputMapping.get(outputColumn); } + /** + * Returns whether the connector should fail fatally when the query errors. + * + * @return {@code true} if failures should be treated as fatal, {@code false} otherwise + */ public boolean isFailOnErrors() { return failOnErrors; } + /** + * {@inheritDoc} + * + * @return the metric identifier configured for this source + */ @Override public String getMetricId() { return metricId; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgSourceConfigBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgSourceConfigBuilder.java index c348eafef..be0cc4dee 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgSourceConfigBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgSourceConfigBuilder.java @@ -2,104 +2,229 @@ import java.util.Map; +/** + * Fluent builder for {@link PgSourceConfig}. + * + *

Collects the PostgreSQL connection settings, query pattern/variables, output mapping, timeouts, + * and failure-handling options through chained setters, then assembles an immutable + * {@link PgSourceConfig} via {@link #createPgSourceConfig()}. Useful for constructing the config + * programmatically (for example in tests) instead of deserializing it. + */ public class PgSourceConfigBuilder { + /** Hostname of the PostgreSQL server. */ private String host; + /** Port of the PostgreSQL server. */ private String port; + /** Username used to authenticate with the database. */ private String user; + /** Password used to authenticate with the database. */ private String password; + /** Name of the database to connect to. */ private String database; + /** Optional protobuf message type used to type-cast result values. */ private String type; + /** Maximum number of concurrent async queries and the connection-pool size. */ private String capacity; + /** Async-IO completion timeout in milliseconds. */ private String streamTimeout; + /** Mapping from each output column name to the result-set column supplying its value. */ private Map outputMapping; + /** Connection timeout in milliseconds. */ private String connectTimeout; + /** Idle-connection timeout in milliseconds. */ private String idleTimeout; + /** Comma-separated input columns whose values fill the query pattern. */ private String queryVariables; + /** Format template for the SQL query. */ private String queryPattern; + /** Whether a failed lookup should abort the job. */ private boolean failOnErrors; + /** Identifier used to tag the metrics emitted for this source. */ private String metricId; + /** Whether to keep the raw result value type instead of casting it. */ private boolean retainResponseType; + /** + * Sets the PostgreSQL server hostname. 
+ * + * @param host the server hostname + * @return this builder for chaining + */ public PgSourceConfigBuilder setHost(String host) { this.host = host; return this; } + /** + * Sets the PostgreSQL server port. + * + * @param port the server port + * @return this builder for chaining + */ public PgSourceConfigBuilder setPort(String port) { this.port = port; return this; } + /** + * Sets the database username. + * + * @param user the database username + * @return this builder for chaining + */ public PgSourceConfigBuilder setUser(String user) { this.user = user; return this; } + /** + * Sets the database password. + * + * @param password the database password + * @return this builder for chaining + */ public PgSourceConfigBuilder setPassword(String password) { this.password = password; return this; } + /** + * Sets the database name to connect to. + * + * @param database the database name + * @return this builder for chaining + */ public PgSourceConfigBuilder setDatabase(String database) { this.database = database; return this; } + /** + * Sets the optional protobuf type used to cast result values. + * + * @param type the protobuf message type name + * @return this builder for chaining + */ public PgSourceConfigBuilder setType(String type) { this.type = type; return this; } + /** + * Sets the maximum number of concurrent queries and the connection-pool size. + * + * @param capacity the capacity value + * @return this builder for chaining + */ public PgSourceConfigBuilder setCapacity(String capacity) { this.capacity = capacity; return this; } + /** + * Sets the async-IO completion timeout in milliseconds. + * + * @param streamTimeout the stream timeout in milliseconds + * @return this builder for chaining + */ public PgSourceConfigBuilder setStreamTimeout(String streamTimeout) { this.streamTimeout = streamTimeout; return this; } + /** + * Sets the mapping from output column to result-set column. 
+ * + * @param outputMapping the output column to result-set column mapping + * @return this builder for chaining + */ public PgSourceConfigBuilder setOutputMapping(Map outputMapping) { this.outputMapping = outputMapping; return this; } + /** + * Sets the connection timeout in milliseconds. + * + * @param connectTimeout the connect timeout in milliseconds + * @return this builder for chaining + */ public PgSourceConfigBuilder setConnectTimeout(String connectTimeout) { this.connectTimeout = connectTimeout; return this; } + /** + * Sets the idle-connection timeout in milliseconds. + * + * @param idleTimeout the idle timeout in milliseconds + * @return this builder for chaining + */ public PgSourceConfigBuilder setIdleTimeout(String idleTimeout) { this.idleTimeout = idleTimeout; return this; } + /** + * Sets the input columns that fill the query pattern. + * + * @param queryVariables the comma-separated query variable column names + * @return this builder for chaining + */ public PgSourceConfigBuilder setQueryVariables(String queryVariables) { this.queryVariables = queryVariables; return this; } + /** + * Sets the SQL query template. + * + * @param queryPattern the SQL query pattern + * @return this builder for chaining + */ public PgSourceConfigBuilder setQueryPattern(String queryPattern) { this.queryPattern = queryPattern; return this; } + /** + * Sets whether a lookup failure should abort the job. + * + * @param failOnErrors {@code true} to fail the job on query errors, {@code false} to tolerate them + * @return this builder for chaining + */ public PgSourceConfigBuilder setFailOnErrors(boolean failOnErrors) { this.failOnErrors = failOnErrors; return this; } + /** + * Sets the identifier used to tag this source's metrics. 
+ * + * @param metricId the metric id + * @return this builder for chaining + */ public PgSourceConfigBuilder setMetricId(String metricId) { this.metricId = metricId; return this; } + /** + * Sets whether to keep the raw result type instead of casting it. + * + * @param retainResponseType {@code true} to store values as-is, {@code false} to cast to the protobuf type + * @return this builder for chaining + */ public PgSourceConfigBuilder setRetainResponseType(boolean retainResponseType) { this.retainResponseType = retainResponseType; return this; } + /** + * Builds an immutable {@link PgSourceConfig} from the values collected by this builder. + * + * @return the assembled Postgres source configuration + */ public PgSourceConfig createPgSourceConfig() { return new PgSourceConfig(host, port, user, password, database, type, capacity, streamTimeout, outputMapping, connectTimeout, idleTimeout, queryVariables, queryPattern, failOnErrors, metricId, retainResponseType); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgStreamDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgStreamDecorator.java index 453aa5bbb..4b40aef56 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgStreamDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/external/pg/PgStreamDecorator.java @@ -16,8 +16,17 @@ public class PgStreamDecorator implements StreamDecorator { + /** + * Configuration describing the Postgres connection, query and output mapping to enrich the stream with. + */ private final PgSourceConfig pgSourceConfig; + /** + * Metric configuration carrying the telemetry settings shared with the created connector. + */ private final ExternalMetricConfig externalMetricConfig; + /** + * Schema configuration providing the descriptors and column metadata for the stream. 
+ */ private final SchemaConfig schemaConfig; /** @@ -33,11 +42,28 @@ public PgStreamDecorator(PgSourceConfig pgSourceConfig, ExternalMetricConfig ext this.schemaConfig = schemaConfig; } + /** + * {@inheritDoc} + * + * <p>
A Postgres decorator can decorate the stream only when a {@code PgSourceConfig} has been configured. + * + * @return {@code true} when the Postgres source config is present, {@code false} otherwise + */ @Override public Boolean canDecorate() { return pgSourceConfig != null; } + /** + * {@inheritDoc} + * + * <p>
Wraps the input stream in an order-preserving asynchronous operator backed by a + * {@link PgAsyncConnector}, using the configured stream timeout and capacity. The connector is + * subscribed to the telemetry subscriber before the operator is created. + * + * @param inputStream the stream of {@link Row} records to enrich with Postgres lookups + * @return the decorated stream emitting enriched {@link Row} records + */ @Override public DataStream decorate(DataStream inputStream) { PgAsyncConnector pgAsyncConnector = new PgAsyncConnector(pgSourceConfig, externalMetricConfig, schemaConfig); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalDecorator.java index eb65bf14e..6a94b4998 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalDecorator.java @@ -12,10 +12,17 @@ */ public class InternalDecorator implements MapDecorator { + /** + * The fixed index, within the wrapped Dagger record, of the output {@link Row} that the + * internal post processor reads from and writes its resolved values back into. + */ public static final int OUTPUT_ROW_INDEX = 1; + /** Configuration describing the single internal source/output mapping handled by this decorator. */ private InternalSourceConfig internalSourceConfig; + /** Strategy that resolves the configured value and writes it into the output row. */ private InternalConfigProcessor internalConfigProcessor; + /** Resolves logical column names to their input/output {@link Row} indices. 
*/ private ColumnNameManager columnNameManager; /** @@ -31,11 +38,30 @@ public InternalDecorator(InternalSourceConfig internalSourceConfig, InternalConf this.columnNameManager = columnNameManager; } + /** + * Determines whether this decorator should be applied to the stream. + * + * <p>
Decoration is skipped when no {@link InternalSourceConfig} was supplied, allowing the + * surrounding post processor to ignore unused internal mappings. + * + * @return {@code true} when an internal source config is present, {@code false} otherwise + */ @Override public Boolean canDecorate() { return internalSourceConfig != null; } + /** + * Applies the configured internal mapping to a single record and returns the enriched row. + * + * <p>
When the existing output row (at index {@link #OUTPUT_ROW_INDEX}) no longer matches the + * managed output arity it is replaced with a fresh {@link Row} sized to the configured output + * columns. The mapping is then delegated to the {@link InternalConfigProcessor} through a + * {@link RowManager}, which populates the output row in place. + * + * @param input the incoming record holding both the input and output rows + * @return the record whose output row has been populated by the internal config processor + */ @Override public Row map(Row input) { Row outputRow = (Row) input.getField(OUTPUT_ROW_INDEX); @@ -47,6 +73,13 @@ public Row map(Row input) { return rowManager.getAll(); } + /** + * Checks whether an existing output row needs to be resized to match the managed columns. + * + * @param outputRow the current output row extracted from the record, possibly {@code null} + * @return {@code true} when {@code outputRow} is non-null and its arity differs from the + * configured output size, {@code false} otherwise + */ private boolean outputColumnSizeIsDifferent(Row outputRow) { return outputRow != null && outputRow.getArity() != columnNameManager.getOutputSize(); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalPostProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalPostProcessor.java index 2912d2405..13ccc422c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalPostProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalPostProcessor.java @@ -19,7 +19,9 @@ */ public class InternalPostProcessor implements PostProcessor { + /** The full post processor configuration; supplies the internal source mappings and output columns. */ private PostProcessorConfig postProcessorConfig; + /** Schema and runtime context (stencil client, configuration) shared with the produced decorators. 
*/ private SchemaConfig schemaConfig; /** @@ -33,11 +35,28 @@ public InternalPostProcessor(PostProcessorConfig postProcessorConfig, SchemaConf this.schemaConfig = schemaConfig; } + /** + * Indicates whether this post processor is applicable to the supplied configuration. + * + * @param config the post processor configuration to inspect + * @return {@code true} when the configuration declares an internal source, {@code false} otherwise + */ @Override public boolean canProcess(PostProcessorConfig config) { return config.hasInternalSource(); } + /** + * Enriches the incoming stream by applying every configured internal source mapping in turn. + * + * <p>
A {@link ColumnNameManager} is built from the stream's input columns and the configured + * output columns. Each {@link InternalSourceConfig} then contributes an {@link InternalDecorator} + * that is chained onto the stream, and the resulting {@link StreamInfo} carries the managed + * output column names. + * + * @param streamInfo the upstream stream together with its current column names + * @return a new {@link StreamInfo} wrapping the decorated stream and its output column names + */ @Override public StreamInfo process(StreamInfo streamInfo) { DataStream resultStream = streamInfo.getDataStream(); @@ -49,6 +68,14 @@ public StreamInfo process(StreamInfo streamInfo) { return new StreamInfo(resultStream, columnNameManager.getOutputColumnNames()); } + /** + * Validates a single internal config and appends its decorator to the stream. + * + * @param resultStream the stream to enrich + * @param configs the validator (an {@link InternalSourceConfig}) whose fields are checked first + * @param decorator the decorator that appends the internal mapping onto the stream + * @return the decorated stream + */ private DataStream enrichStream(DataStream resultStream, Validator configs, StreamDecorator decorator) { configs.validateFields(); return decorator.decorate(resultStream); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalSourceConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalSourceConfig.java index 7a9ee0a87..9150ea080 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalSourceConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/InternalSourceConfig.java @@ -10,11 +10,16 @@ * A class that holds internal post processor configuration. */ public class InternalSourceConfig implements Validator, Serializable { + /** Configuration key under which the nested internal-processor settings are supplied. 
*/ public static final String INTERNAL_PROCESSOR_CONFIG_KEY = "internal_processor_config"; + /** Name of the output column this mapping writes its resolved value into. */ private String outputField; + /** The configured value or expression to resolve; its meaning depends on {@link #type}. */ private String value; + /** The processor type handling this mapping, for example {@code sql}, {@code function} or {@code constant}. */ private String type; + /** Additional key/value settings consumed by the selected internal processor. */ private Map internalProcessorConfig; /** @@ -32,6 +37,14 @@ public InternalSourceConfig(String outputField, String value, String type, MapDeclares {@code output_field}, {@code type} and {@code value} as mandatory so that + * validation fails fast when any of them is missing from the internal source config. + * + * @return a map of mandatory field names to their currently configured values + */ @Override public HashMap getMandatoryFields() { HashMap mandatoryFields = new HashMap<>(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/InternalConfigHandlerFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/InternalConfigHandlerFactory.java index 7f39d0c7d..c86e7a309 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/InternalConfigHandlerFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/InternalConfigHandlerFactory.java @@ -16,10 +16,27 @@ * The factory class for Internal config handler. */ public class InternalConfigHandlerFactory { + /** + * Prevents instantiation of this static factory. + * + * @throws IllegalStateException always, since the class only exposes static helpers + */ private InternalConfigHandlerFactory() { throw new IllegalStateException("Factory class"); } + /** + * Builds the ordered list of candidate internal config processors. 
+ * + * <p>
The handlers are evaluated in declaration order, so SQL handling takes precedence over + * function handling, which in turn precedes constant handling. + * + * @param columnNameManager resolves logical column names to row indices + * @param sqlPathParser extracts input data for SQL-type mappings + * @param internalSourceConfig the internal source configuration being handled + * @param schemaConfig the schema/runtime context passed to function processors + * @return the candidate processors to try, in priority order + */ private static List getHandlers(ColumnNameManager columnNameManager, SqlConfigTypePathParser sqlPathParser, InternalSourceConfig internalSourceConfig, SchemaConfig schemaConfig) { return Arrays.asList(new SqlInternalConfigProcessor(columnNameManager, sqlPathParser, internalSourceConfig), new FunctionInternalConfigProcessor(columnNameManager, internalSourceConfig, schemaConfig), diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/constant/ConstantInternalConfigProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/constant/ConstantInternalConfigProcessor.java index c7aeec1ea..fb10e1f71 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/constant/ConstantInternalConfigProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/constant/ConstantInternalConfigProcessor.java @@ -11,8 +11,11 @@ * The Constant internal config processor. */ public class ConstantInternalConfigProcessor implements InternalConfigProcessor, Serializable { + /** The configuration {@code type} value that selects this constant processor. */ public static final String CONSTANT_CONFIG_HANDLER_TYPE = "constant"; + /** Resolves the configured output column name to its index in the output row. 
*/ private ColumnNameManager columnNameManager; + /** The internal source configuration supplying the output field and constant value. */ private InternalSourceConfig internalSourceConfig; /** @@ -26,11 +29,25 @@ public ConstantInternalConfigProcessor(ColumnNameManager columnNameManager, Inte this.internalSourceConfig = internalSourceConfig; } + /** + * Indicates whether this processor handles the supplied internal config type. + * + * @param type the configured internal source type + * @return {@code true} when {@code type} equals {@link #CONSTANT_CONFIG_HANDLER_TYPE}, {@code false} otherwise + */ @Override public boolean canProcess(String type) { return CONSTANT_CONFIG_HANDLER_TYPE.equals(type); } + /** + * Writes the configured constant value into the resolved output column of the record. + * + * <p>
When the configured output field cannot be resolved to a column index the record is left + * unchanged. + * + * @param rowManager the row manager wrapping the record whose output row is updated + */ @Override public void process(RowManager rowManager) { int outputFieldIndex = columnNameManager.getOutputIndex(internalSourceConfig.getOutputField()); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionInternalConfigProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionInternalConfigProcessor.java index 6891b2e7d..338430080 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionInternalConfigProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionInternalConfigProcessor.java @@ -12,10 +12,14 @@ * The Function internal config processor. */ public class FunctionInternalConfigProcessor implements InternalConfigProcessor, Serializable { + /** The configuration {@code type} value that selects this function processor. */ public static final String FUNCTION_CONFIG_HANDLER_TYPE = "function"; + /** Resolves the configured output column name to its index in the output row. */ private ColumnNameManager columnNameManager; + /** The internal source configuration supplying the output field and function name. */ private InternalSourceConfig internalSourceConfig; + /** The concrete function selected for this mapping, resolved from the configured value. */ protected FunctionProcessor functionProcessor; /** @@ -31,11 +35,25 @@ public FunctionInternalConfigProcessor(ColumnNameManager columnNameManager, Inte this.functionProcessor = FunctionProcessorFactory.getFunctionProcessor(internalSourceConfig, schemaConfig); } + /** + * Indicates whether this processor handles the supplied internal config type. 
+ * + * @param type the configured internal source type + * @return {@code true} when {@code type} equals {@link #FUNCTION_CONFIG_HANDLER_TYPE}, {@code false} otherwise + */ @Override public boolean canProcess(String type) { return FUNCTION_CONFIG_HANDLER_TYPE.equals(type); } + /** + * Evaluates the configured function and writes its result into the resolved output column. + * + * <p>
When the configured output field cannot be resolved to a column index the record is left + * unchanged. + * + * @param rowManager the row manager wrapping the record read by the function and updated in place + */ @Override public void process(RowManager rowManager) { int outputFieldIndex = columnNameManager.getOutputIndex(internalSourceConfig.getOutputField()); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionProcessor.java index e06d47909..cf64f1e91 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionProcessor.java @@ -1,6 +1,13 @@ package com.gotocompany.dagger.core.processors.internal.processor.function; import com.gotocompany.dagger.core.processors.common.RowManager; +/** + * Strategy for computing the value of a {@code function}-typed internal post-processor mapping. + * + * <p>
Implementations recognise a specific function name (such as {@code CURRENT_TIMESTAMP} or + * {@code JSON_PAYLOAD}) and derive the value written into the output row from the record exposed + * by the supplied {@link RowManager}. + */ public interface FunctionProcessor { /** * Check if function can be processed. diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionProcessorFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionProcessorFactory.java index 48da06ea2..301640dbd 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionProcessorFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/FunctionProcessorFactory.java @@ -14,10 +14,25 @@ * The factory class for internal function post processors. */ public class FunctionProcessorFactory { + /** + * Prevents instantiation of this static factory. + * + * @throws IllegalStateException always, since the class only exposes static helpers + */ private FunctionProcessorFactory() { throw new IllegalStateException("Factory class"); } + /** + * Builds the ordered list of candidate function processors. + * + * <p>
Functions are evaluated in declaration order; the system-default-zone {@link Clock} backs + * the {@link CurrentTimestampFunction}. + * + * @param internalSourceConfig the internal source configuration being handled + * @param schemaConfig the schema/runtime context passed to functions that need it + * @return the candidate function processors to try, in priority order + */ private static List getFunctions(InternalSourceConfig internalSourceConfig, SchemaConfig schemaConfig) { Clock clock = Clock.systemDefaultZone(); return Arrays.asList(new CurrentTimestampFunction(clock), diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/CurrentTimestampFunction.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/CurrentTimestampFunction.java index b177ab8db..c6c137009 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/CurrentTimestampFunction.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/CurrentTimestampFunction.java @@ -7,14 +7,34 @@ import java.io.Serializable; import java.time.Clock; +/** + * Internal post-processor function that yields the current wall-clock time. + * + * <p>
Selected when an internal source of type {@code function} has the value + * {@code CURRENT_TIMESTAMP}. The injected {@link Clock} makes the produced value deterministic + * in tests. + */ public class CurrentTimestampFunction implements FunctionProcessor, Serializable { + /** The configured function value that selects this function. */ public static final String CURRENT_TIMESTAMP_FUNCTION_KEY = "CURRENT_TIMESTAMP"; + /** Clock used to read the current time; injected so the result can be controlled in tests. */ private Clock clock; + /** + * Instantiates a new current timestamp function. + * + * @param clock the clock supplying the current time + */ public CurrentTimestampFunction(Clock clock) { this.clock = clock; } + /** + * Indicates whether this function handles the supplied function name. + * + * @param functionName the configured function value to match + * @return {@code true} when {@code functionName} equals {@link #CURRENT_TIMESTAMP_FUNCTION_KEY}, {@code false} otherwise + */ @Override public boolean canProcess(String functionName) { return CURRENT_TIMESTAMP_FUNCTION_KEY.equals(functionName); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/InvalidFunction.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/InvalidFunction.java index 596c8d004..332d5f19b 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/InvalidFunction.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/InvalidFunction.java @@ -7,7 +7,14 @@ import com.gotocompany.dagger.core.processors.internal.InternalSourceConfig; import java.io.Serializable; +/** + * Fallback {@link FunctionProcessor} used when a configured function name is not recognised. 
+ * + * <p>
It never matches a function name and always fails fast on evaluation, surfacing the offending + * configuration through an {@link InvalidConfigurationException}. + */ public class InvalidFunction implements FunctionProcessor, Serializable { + /** The internal source configuration whose unsupported function value triggered this fallback. */ private InternalSourceConfig internalSourceConfig; /** @@ -19,11 +26,24 @@ public InvalidFunction(InternalSourceConfig internalSourceConfig) { this.internalSourceConfig = internalSourceConfig; } + /** + * Always reports that no function can be processed. + * + * @param functionName the configured function value (ignored) + * @return {@code false} always + */ @Override public boolean canProcess(String functionName) { return false; } + /** + * Always fails because the configured function is unsupported. + * + * @param rowManager the row manager wrapping the current record (unused) + * @return never returns normally + * @throws InvalidConfigurationException always, naming the unsupported function + */ public Object getResult(RowManager rowManager) { String functionName = ""; if (internalSourceConfig != null) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/JsonPayloadFunction.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/JsonPayloadFunction.java index c93ca69bd..773430642 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/JsonPayloadFunction.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/function/functions/JsonPayloadFunction.java @@ -14,12 +14,24 @@ import java.util.Map; import java.io.Serializable; +/** + * Internal post-processor function that serialises the incoming record to its JSON representation. 
+ * + * <p>
Selected when an internal source of type {@code function} has the value {@code JSON_PAYLOAD}. + * The proto class named under {@link #SCHEMA_PROTO_CLASS_KEY} (resolved through the stencil client) + * drives the {@link JsonRowSerializationSchema} that converts the input {@code Row} to JSON. + */ public class JsonPayloadFunction implements FunctionProcessor, Serializable { + /** The configured function value that selects this function. */ public static final String JSON_PAYLOAD_FUNCTION_KEY = "JSON_PAYLOAD"; + /** Key, within the internal processor config, naming the proto class used to build the JSON schema. */ public static final String SCHEMA_PROTO_CLASS_KEY = "schema_proto_class"; + /** The internal source configuration providing the nested processor settings. */ private InternalSourceConfig internalSourceConfig; + /** Schema/runtime context used to obtain the stencil client and proto descriptors. */ private SchemaConfig schemaConfig; + /** Lazily-built schema converting the input row into JSON; cached after first use. */ private JsonRowSerializationSchema jsonRowSerializationSchema; /** @@ -33,6 +45,12 @@ public JsonPayloadFunction(InternalSourceConfig internalSourceConfig, SchemaConf this.schemaConfig = schemaConfig; } + /** + * Indicates whether this function handles the supplied function name. + * + * @param functionName the configured function value to match + * @return {@code true} when {@code functionName} equals {@link #JSON_PAYLOAD_FUNCTION_KEY}, {@code false} otherwise + */ @Override public boolean canProcess(String functionName) { return JSON_PAYLOAD_FUNCTION_KEY.equals(functionName); @@ -51,6 +69,16 @@ public Object getResult(RowManager rowManager) { return new String(jsonRowSerializationSchema.serialize(rowManager.getInputData())); } + /** + * Builds the {@link JsonRowSerializationSchema} used to serialise input records to JSON. 
+ * + * <p>
The proto descriptor named by {@link #SCHEMA_PROTO_CLASS_KEY} is resolved through the + * stencil client and converted into Flink type information. + * + * @return a serialization schema matching the configured input proto descriptor + * @throws InvalidConfigurationException when the stencil client is unavailable, the internal + * processor config is missing, or it does not declare {@link #SCHEMA_PROTO_CLASS_KEY} + */ private JsonRowSerializationSchema createJsonRowSerializationSchema() { StencilClient stencilClient = schemaConfig.getStencilClientOrchestrator().getStencilClient(); if (stencilClient == null) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/invalid/InvalidInternalConfigProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/invalid/InvalidInternalConfigProcessor.java index e71953063..bd0df6e2d 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/invalid/InvalidInternalConfigProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/invalid/InvalidInternalConfigProcessor.java @@ -13,6 +13,7 @@ */ public class InvalidInternalConfigProcessor implements InternalConfigProcessor, Serializable { + /** The internal source configuration whose unsupported type triggered this fallback. */ private InternalSourceConfig internalSourceConfig; /** @@ -24,11 +25,23 @@ public InvalidInternalConfigProcessor(InternalSourceConfig internalSourceConfig) this.internalSourceConfig = internalSourceConfig; } + /** + * Always reports that no internal config type can be processed. + * + * @param type the configured internal source type (ignored) + * @return {@code false} always + */ @Override public boolean canProcess(String type) { return false; } + /** + * Always fails because the configured internal source type is unsupported. 
+ * + * @param rowManager the row manager wrapping the current record (unused) + * @throws InvalidConfigurationException always, naming the unsupported type + */ public void process(RowManager rowManager) { String type = ""; if (internalSourceConfig != null && StringUtils.isNotEmpty(internalSourceConfig.getType())) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/SqlConfigTypePathParser.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/SqlConfigTypePathParser.java index d9dfd7448..6d66917ac 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/SqlConfigTypePathParser.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/SqlConfigTypePathParser.java @@ -13,7 +13,9 @@ */ public class SqlConfigTypePathParser implements Serializable { + /** The internal source configuration supplying the input field name or select-all marker. */ private InternalSourceConfig internalSourceConfig; + /** Resolves logical column names to their input row indices. */ private ColumnNameManager columnNameManager; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalAutoFieldImport.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalAutoFieldImport.java index cc6ed6413..3fc3ff2e0 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalAutoFieldImport.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalAutoFieldImport.java @@ -10,6 +10,7 @@ */ public class SqlInternalAutoFieldImport implements SqlInternalFieldConfig { + /** Resolves logical column names to their input/output row indices. 
*/ private ColumnNameManager columnNameManager; /** @@ -21,6 +22,14 @@ public SqlInternalAutoFieldImport(ColumnNameManager columnNameManager) { this.columnNameManager = columnNameManager; } + /** + * Copies every input column straight through to the matching output column. + * + * <p>
For each managed input column the value is read from the input row and written into the + * output column carrying the same name, implementing the SQL select-all behaviour. + * + * @param rowManager the row manager wrapping the record whose output row is populated + */ @Override public void processInputColumns(RowManager rowManager) { for (String columnName : columnNameManager.getInputColumnNames()) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalConfigProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalConfigProcessor.java index 235c54def..b6199823b 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalConfigProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalConfigProcessor.java @@ -14,10 +14,14 @@ */ public class SqlInternalConfigProcessor implements InternalConfigProcessor, Serializable { + /** The configuration {@code type} value that selects this SQL processor. */ public static final String SQL_CONFIG_HANDLER_TYPE = "sql"; + /** Resolves logical column names to their input/output row indices. */ private ColumnNameManager columnNameManager; + /** Extracts the configured input data (single field or whole input row) for the mapping. */ private SqlConfigTypePathParser sqlPathParser; + /** The internal source configuration supplying the input value and output field. */ private InternalSourceConfig internalSourceConfig; /** @@ -33,11 +37,26 @@ public SqlInternalConfigProcessor(ColumnNameManager columnNameManager, SqlConfig this.internalSourceConfig = internalSourceConfig; } + /** + * Indicates whether this processor handles the supplied internal config type. 
+ * + * @param type the configured internal source type + * @return {@code true} when {@code type} equals {@link #SQL_CONFIG_HANDLER_TYPE}, {@code false} otherwise + */ @Override public boolean canProcess(String type) { return SQL_CONFIG_HANDLER_TYPE.equals(type); } + /** + * Resolves and applies the appropriate SQL field mapping to the record. + * + * <p>
A {@link SqlInternalFieldFactory} chooses between a select-all import and a single-field + * import based on the configuration, and the selected {@link SqlInternalFieldConfig} populates + * the output row. + * + * @param rowManager the row manager wrapping the record whose output row is populated + */ @Override public void process(RowManager rowManager) { SqlInternalFieldConfig sqlInternalFieldConfig = diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalFieldFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalFieldFactory.java index 84555233a..b7d4005dd 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalFieldFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalFieldFactory.java @@ -10,8 +10,11 @@ * The factory class for Sql internal field processor. */ public class SqlInternalFieldFactory { + /** Resolves logical column names to their input/output row indices. */ private ColumnNameManager columnNameManager; + /** Extracts the configured input data for single-field SQL mappings. */ private SqlConfigTypePathParser sqlPathParser; + /** The internal source configuration supplying the output field and select-all marker. */ private InternalSourceConfig internalSourceConfig; /** @@ -40,6 +43,11 @@ public SqlInternalFieldConfig getSqlInternalFieldConfig() { } } + /** + * Determines whether the configuration requests importing all input columns. 
+ * + * @return {@code true} when the configured output field is the SQL select-all marker, {@code false} otherwise + */ private boolean selectAllFromInputColumns() { return Constants.SQL_PATH_SELECT_ALL_CONFIG_VALUE.equals(internalSourceConfig.getOutputField()); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalFieldImport.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalFieldImport.java index 6718d9d1f..fc6be68c3 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalFieldImport.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/internal/processor/sql/fields/SqlInternalFieldImport.java @@ -11,8 +11,11 @@ */ public class SqlInternalFieldImport implements SqlInternalFieldConfig { + /** Resolves logical column names to their input/output row indices. */ private ColumnNameManager columnNameManager; + /** Extracts the configured input data (single field or whole input row) for the mapping. */ private SqlConfigTypePathParser sqlPathParser; + /** The internal source configuration supplying the input value and output field. */ private InternalSourceConfig internalSourceConfig; /** @@ -28,6 +31,14 @@ public SqlInternalFieldImport(ColumnNameManager columnNameManager, SqlConfigType this.internalSourceConfig = internalSourceConfig; } + /** + * Resolves the configured input value and writes it into the mapped output column. + * + *
<p>
When the configured output field cannot be resolved to a column index the record is left + * unchanged. + * + * @param rowManager the row manager wrapping the record whose output row is populated + */ @Override public void processInputColumns(RowManager rowManager) { int outputFieldIndex = columnNameManager.getOutputIndex(internalSourceConfig.getOutputField()); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowFactory.java index ea35c6222..30c31ec6c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowFactory.java @@ -40,12 +40,33 @@ * The factory class for Longbow. */ public class LongbowFactory { + /** + * The Longbow schema describing the column layout and Longbow type of the input stream. + */ private LongbowSchema longbowSchema; + /** + * The Dagger configuration used to resolve Longbow, BigTable and stream settings. + */ private Configuration configuration; + /** + * The async processor used to wrap the rich async functions into ordered-wait operators. + */ private AsyncProcessor asyncProcessor; + /** + * The orchestrator that supplies the Stencil client for resolving Protobuf descriptors. + */ private StencilClientOrchestrator stencilClientOrchestrator; + /** + * The exporter that the reader and writer notify so their metrics are published. + */ private MetricsTelemetryExporter metricsTelemetryExporter; + /** + * The Longbow column names derived from the schema, used to build readers and writers. + */ private String[] columnNames; + /** + * Shared {@code Gson} instance used to parse the input streams configuration JSON. 
+ */ private static final Gson GSON = new Gson(); /** @@ -119,6 +140,16 @@ public PostProcessor getLongbowProcessor() { } } + /** + * Builds a {@link LongbowReader} for the Longbow+ read flow, where the scanned BigTable rows + * carry serialized Protobuf payloads. + * + *
<p>
The reader is wired with the range resolved from the schema, a {@code ScanRequestFactory} + * targeting the configured BigTable table, a {@link LongbowProtoData} parser and a + * {@code ReaderOutputProtoData} output mapper. + * + * @return the configured Longbow reader for the Protobuf-backed read flow + */ private LongbowReader longbowReaderPlus() { LongbowRange longbowRange = LongbowRangeFactory.getLongbowRange(longbowSchema); ScanRequestFactory scanRequestFactory = new ScanRequestFactory(longbowSchema, getTableId(configuration)); @@ -127,6 +158,16 @@ private LongbowReader longbowReaderPlus() { return new LongbowReader(configuration, longbowSchema, longbowRange, longbowTableData, scanRequestFactory, readerOutputRow); } + /** + * Builds a {@link LongbowReader} for the standard Longbow read flow, where scanned BigTable rows + * are mapped back into individual schema columns. + * + *
<p>
The reader is wired with the range resolved from the schema, a {@code ScanRequestFactory} + * targeting the configured table, a {@link LongbowTableData} parser and a + * {@code ReaderOutputLongbowData} output mapper. + * + * @return the configured Longbow reader for the column-based read flow + */ private LongbowReader longbowReader() { LongbowRange longbowRange = LongbowRangeFactory.getLongbowRange(longbowSchema); ScanRequestFactory scanRequestFactory = new ScanRequestFactory(longbowSchema, getTableId(configuration)); @@ -135,6 +176,16 @@ private LongbowReader longbowReader() { return new LongbowReader(configuration, longbowSchema, longbowRange, longbowTableData, scanRequestFactory, readerOutputRow); } + /** + * Builds a {@link LongbowWriter} for the Longbow+ write flow, serializing the input row into a + * Protobuf payload before persisting it to BigTable. + * + *
<p>
A {@link ProtoSerializer} is created from the configured input Protobuf class, and the + * writer is given a {@code PutRequestFactory} plus an {@code OutputSynchronizer} that records the + * synchronization metadata for the downstream reader. + * + * @return the configured Longbow writer for the Protobuf-backed write flow + */ private LongbowWriter longbowWriterPlus() { ProtoSerializer protoSerializer = new ProtoSerializer(null, getMessageProtoClassName(configuration), columnNames, stencilClientOrchestrator); String tableId = getTableId(configuration); @@ -143,6 +194,15 @@ private LongbowWriter longbowWriterPlus() { return new LongbowWriter(configuration, longbowSchema, putRequestFactory, tableId, outputSynchronizer); } + /** + * Builds a {@link LongbowWriter} for the standard Longbow write flow, persisting the row columns + * directly to BigTable. + * + *
<p>
The writer uses a {@code PutRequestFactory} without a serializer and an + * {@code OutputIdentity} that passes the input row through unchanged. + * + * @return the configured Longbow writer for the column-based write flow + */ private LongbowWriter longbowWriter() { String tableId = getTableId(configuration); PutRequestFactory putRequestFactory = new PutRequestFactory(longbowSchema, null, tableId); @@ -150,11 +210,26 @@ private LongbowWriter longbowWriter() { return new LongbowWriter(configuration, longbowSchema, putRequestFactory, tableId, outputIdentity); } + /** + * Resolves the BigTable table id to use for Longbow operations. + * + *
<p>
The explicit Longbow GCP table id is preferred; when it is absent the Dagger job name (or + * its default) is used instead. + * + * @param config the configuration to read the table id and Dagger name from + * @return the resolved BigTable table id + */ private String getTableId(Configuration config) { return config .getString(PROCESSOR_LONGBOW_GCP_TABLE_ID_KEY, config.getString(DAGGER_NAME_KEY, DAGGER_NAME_DEFAULT)); } + /** + * Extracts the input Protobuf message class name from the first configured input stream. + * + * @param config the configuration holding the input streams JSON + * @return the fully-qualified Protobuf class name of the first input stream + */ private String getMessageProtoClassName(Configuration config) { String jsonArrayString = config.getString(INPUT_STREAMS, ""); Map[] streamsConfig = GSON.fromJson(jsonArrayString, Map[].class); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowProcessor.java index e5edbfd23..6ea576678 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowProcessor.java @@ -19,9 +19,21 @@ */ public class LongbowProcessor implements PostProcessor { + /** + * The async processor used to attach each rich async function as an ordered-wait operator. + */ private AsyncProcessor asyncProcessor; + /** + * The Dagger configuration providing the Longbow async timeout and thread capacity. + */ private Configuration configuration; + /** + * The ordered list of Longbow rich async functions (writer and/or reader) applied to the stream. + */ private ArrayList> longbowRichFunctions; + /** + * The column modifier that adjusts the output column names for the chosen Longbow type. 
+ */ private ColumnModifier modifier; /** @@ -39,6 +51,16 @@ public LongbowProcessor(AsyncProcessor asyncProcessor, Configuration configurati this.modifier = modifier; } + /** + * Applies the configured Longbow rich async functions to the incoming stream in order. + * + *
<p>
Each function is wrapped in an ordered-wait async operator using the Longbow async timeout + * and thread capacity from the configuration. The resulting stream is returned together with the + * column names produced by the {@link ColumnModifier}. + * + * @param streamInfo the incoming stream and its column names + * @return a new {@link StreamInfo} wrapping the Longbow-processed stream and modified column names + */ @Override public StreamInfo process(StreamInfo streamInfo) { DataStream inputStream = streamInfo.getDataStream(); @@ -51,6 +73,15 @@ public StreamInfo process(StreamInfo streamInfo) { return new StreamInfo(outputStream, modifier.modifyColumnNames(streamInfo.getColumnNames())); } + /** + * Indicates whether this post processor can handle the given configuration. + * + *
<p>
The Longbow processor is always constructed explicitly by the {@code LongbowFactory} rather + * than selected from configuration, so this always returns {@code false}. + * + * @param postProcessorConfig the post processor configuration to test + * @return {@code false}, as this processor is never chosen via configuration matching + */ @Override public boolean canProcess(PostProcessorConfig postProcessorConfig) { return false; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowSchema.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowSchema.java index f486278a8..463063ed4 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowSchema.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/LongbowSchema.java @@ -25,7 +25,13 @@ * A class that holds the Longbow schema. */ public class LongbowSchema implements Serializable { + /** + * Maps each column name to its positional index within the input {@link Row}. + */ private HashMap columnIndexMap; + /** + * The ordered list of column names that make up the Longbow input schema. + */ private List columnNames; /** @@ -190,6 +196,12 @@ public boolean isLongbowPlus() { return getType() != LongbowType.LongbowProcess; } + /** + * Normalizes a rowtime field value into a {@link Timestamp}. 
+ * + * @param timeStampField the rowtime field value, either a {@link LocalDateTime} or a {@link Timestamp} + * @return the value as a {@link Timestamp} + */ private Timestamp convertToTimeStamp(Object timeStampField) { if (timeStampField instanceof LocalDateTime) { return Timestamp.valueOf((LocalDateTime) timeStampField); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/LongbowReadColumnModifier.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/LongbowReadColumnModifier.java index 7c75cb988..88fa5893d 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/LongbowReadColumnModifier.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/LongbowReadColumnModifier.java @@ -10,6 +10,15 @@ */ public class LongbowReadColumnModifier implements ColumnModifier { + /** + * Appends the Longbow Protobuf data column to the input column names. + * + *
<p>
Used by the Longbow+ read flow so that the serialized Protobuf payload scanned from BigTable + * is exposed as an additional output column. + * + * @param inputColumnNames the incoming column names + * @return the column names with the Longbow proto data column appended + */ @Override public String[] modifyColumnNames(String[] inputColumnNames) { ArrayList inputColumnList = new ArrayList<>(Arrays.asList(inputColumnNames)); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/LongbowWriteColumnModifier.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/LongbowWriteColumnModifier.java index 156aa003c..9cb9ba162 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/LongbowWriteColumnModifier.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/LongbowWriteColumnModifier.java @@ -10,6 +10,15 @@ */ public class LongbowWriteColumnModifier implements ColumnModifier { + /** + * Appends the synchronizer metadata columns to the input column names. + * + *
<p>
Used by the Longbow write flow to expose the BigTable table id, the input class name and the + * Longbow read key needed to synchronize with the downstream reader. + * + * @param inputColumnNames the incoming column names + * @return the column names with the synchronizer columns appended + */ @Override public String[] modifyColumnNames(String[] inputColumnNames) { ArrayList outputList = new ArrayList<>(Arrays.asList(inputColumnNames)); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/NoOpColumnModifier.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/NoOpColumnModifier.java index a93b40a0c..f5a777501 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/NoOpColumnModifier.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/columnmodifier/NoOpColumnModifier.java @@ -4,6 +4,14 @@ * The No op column modifier. */ public class NoOpColumnModifier implements ColumnModifier { + /** + * Returns the input column names unchanged. + * + *
<p>
Used by the combined Longbow process flow, which neither adds nor removes any columns. + * + * @param inputColumnNames the incoming column names + * @return the same column names that were passed in + */ @Override public String[] modifyColumnNames(String[] inputColumnNames) { return inputColumnNames; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowDataFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowDataFactory.java index ac750eb0b..10aaf7de5 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowDataFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowDataFactory.java @@ -6,6 +6,9 @@ * The factory class for Longbow data. */ public class LongbowDataFactory { + /** + * The Longbow schema used to decide which {@link LongbowData} implementation to create. + */ private LongbowSchema longbowSchema; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowProtoData.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowProtoData.java index 7de943df4..cc4750fc6 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowProtoData.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowProtoData.java @@ -13,6 +13,9 @@ * The Longbow proto data. */ public class LongbowProtoData implements LongbowData { + /** + * The BigTable column family, in bytes, that Longbow data is stored under. + */ private static final byte[] COLUMN_FAMILY_NAME = Bytes.toBytes(Constants.LONGBOW_COLUMN_FAMILY_DEFAULT); /** @@ -21,6 +24,13 @@ public class LongbowProtoData implements LongbowData { public LongbowProtoData() { } + /** + * Extracts the serialized Protobuf payloads from the scanned BigTable rows. 
+ * + * @param scanResult the BigTable scan results, ordered from latest to earliest + * @return a map with a single entry keyed by the Longbow proto data key whose value is the list + * of raw payload byte arrays, one per scanned row + */ @Override public Map> parse(List scanResult) { ArrayList data = new ArrayList<>(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowTableData.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowTableData.java index f131e7587..bb9ec05fe 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowTableData.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/data/LongbowTableData.java @@ -16,7 +16,13 @@ */ public class LongbowTableData implements LongbowData { + /** + * The BigTable column family, in bytes, that Longbow data is stored under. + */ private static final byte[] COLUMN_FAMILY_NAME = Bytes.toBytes(Constants.LONGBOW_COLUMN_FAMILY_DEFAULT); + /** + * The Longbow schema used to resolve which data columns to read from the scan results. + */ private LongbowSchema longbowSchema; /** @@ -28,6 +34,15 @@ public LongbowTableData(LongbowSchema longbowSchema) { this.longbowSchema = longbowSchema; } + /** + * Groups the scanned BigTable rows by Longbow data column name. + * + *
<p>
For each schema column that holds Longbow data an entry is produced; the value is the list + * of cell values across the scan results, or an empty list when the scan returned nothing. + * + * @param scanResult the BigTable scan results to read from + * @return a map from each Longbow data column name to its list of string values + */ @Override public Map> parse(List scanResult) { Map> longbowData = new HashMap<>(); @@ -40,6 +55,13 @@ public Map> parse(List scanResult) { return longbowData; } + /** + * Reads the values of a single column across all scan results. + * + * @param resultScan the BigTable scan results to read from + * @param name the column qualifier to extract + * @return the list of string values for the given column, one per result + */ private List getData(List resultScan, String name) { return resultScan .stream() diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/OutputIdentity.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/OutputIdentity.java index 0d09e19ec..f9628a89e 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/OutputIdentity.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/OutputIdentity.java @@ -6,6 +6,15 @@ * The Output identity. */ public class OutputIdentity implements WriterOutputRow { + /** + * Returns the input row unchanged. + * + *
<p>
This identity implementation is selected when the Longbow writer should emit the original + * record as-is, without appending any synchronizer metadata columns. + * + * @param input the row to pass through + * @return the same {@code input} row, unmodified + */ @Override public Row get(Row input) { return input; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/OutputSynchronizer.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/OutputSynchronizer.java index ac021e2ac..cb3ecfbf8 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/OutputSynchronizer.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/OutputSynchronizer.java @@ -12,8 +12,17 @@ * The Output synchronizer. */ public class OutputSynchronizer implements WriterOutputRow { + /** + * Schema describing the Longbow columns and key layout for the current job. + */ private LongbowSchema longbowSchema; + /** + * Identifier of the BigTable table that received the written record. + */ private String tableId; + /** + * Fully qualified name of the input Protobuf message class. + */ private String inputProto; /** @@ -29,6 +38,17 @@ public OutputSynchronizer(LongbowSchema longbowSchema, String tableId, String in this.inputProto = inputProto; } + /** + * Builds the synchronizer output row for a written Longbow record. + * + *
<p>
The returned row copies every field of {@code input} and appends three extra fields: the + * BigTable table id, the input Protobuf class name, and the Longbow key extracted from the + * configured write key column. The output arity is the input arity plus + * {@code Constants.LONGBOW_OUTPUT_ADDITIONAL_ARITY}. + * + * @param input the input row produced upstream of the Longbow writer + * @return a new row containing the original fields followed by the synchronizer metadata + */ @Override public Row get(Row input) { int outputArity = input.getArity() + Constants.LONGBOW_OUTPUT_ADDITIONAL_ARITY; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/ReaderOutputLongbowData.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/ReaderOutputLongbowData.java index 000919a2e..0aff2ec50 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/ReaderOutputLongbowData.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/ReaderOutputLongbowData.java @@ -12,6 +12,9 @@ * The Reader output longbow data. */ public class ReaderOutputLongbowData implements ReaderOutputRow { + /** + * Schema used to resolve column names and indices when assembling the output row. + */ private LongbowSchema longbowSchema; /** @@ -23,6 +26,17 @@ public ReaderOutputLongbowData(LongbowSchema longbowSchema) { this.longbowSchema = longbowSchema; } + /** + * Merges the BigTable scan result with the input row into a Longbow output row. + * + *
<p>
Non Longbow-data columns are copied from {@code input}, then every entry of + * {@code scanResult} overrides or adds to that map. Each value is finally written into a new row + * at the index assigned to its column by the {@link LongbowSchema}. + * + * @param scanResult the parsed Longbow data keyed by output column name + * @param input the input row that triggered the lookup + * @return a new row populated with the merged input and scanned Longbow data + */ @Override public Row get(Map scanResult, Row input) { HashMap columnMap = new HashMap<>(); @@ -37,6 +51,12 @@ public Row get(Map scanResult, Row input) { return output; } + /** + * Determines whether the given schema column holds Longbow data. + * + * @param c a schema entry mapping a column name to its row index + * @return {@code true} if the column name contains the Longbow data key marker, otherwise {@code false} + */ private boolean isLongbowData(Map.Entry c) { return c.getKey().contains(Constants.LONGBOW_DATA_KEY); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/ReaderOutputProtoData.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/ReaderOutputProtoData.java index 3fcca44e3..edf1797bb 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/ReaderOutputProtoData.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/outputRow/ReaderOutputProtoData.java @@ -12,6 +12,9 @@ * The Reader output proto data. */ public class ReaderOutputProtoData implements ReaderOutputRow { + /** + * Schema used to resolve column names and indices when assembling the output row. + */ private LongbowSchema longbowSchema; /** @@ -23,6 +26,17 @@ public ReaderOutputProtoData(LongbowSchema longbowSchema) { this.longbowSchema = longbowSchema; } + /** + * Builds a Longbow output row that carries the scanned Protobuf bytes column. + * + *
<p>
All non proto-data columns are copied from {@code input} into their schema-assigned indices, + * and the serialized Protobuf payload retrieved from {@code scanResult} under + * {@code Constants.LONGBOW_PROTO_DATA_KEY} is appended as the final field. + * + * @param scanResult the parsed scan result containing the Longbow proto-data payload + * @param input the input row that triggered the lookup + * @return a new row with the input columns followed by the scanned proto-data value + */ @Override public Row get(Map scanResult, Row input) { HashMap columnMap = new HashMap<>(); @@ -37,6 +51,12 @@ public Row get(Map scanResult, Row input) { return output; } + /** + * Determines whether the given schema column holds the Longbow proto-data payload. + * + * @param c a schema entry mapping a column name to its row index + * @return {@code true} if the column name contains the Longbow proto-data key marker, otherwise {@code false} + */ private boolean isLongbowProtoData(Map.Entry c) { return c.getKey().contains(Constants.LONGBOW_PROTO_DATA_KEY); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/processor/LongbowReader.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/processor/LongbowReader.java index 3b2ea681e..708a530a0 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/processor/LongbowReader.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/processor/LongbowReader.java @@ -40,16 +40,49 @@ */ public class LongbowReader extends RichAsyncFunction implements TelemetryPublisher { + /** + * Logger used to record scan failures and timeout events for this reader. + */ private static final Logger LOGGER = LoggerFactory.getLogger(LongbowReader.class.getName()); + /** + * Dagger configuration used to lazily build the BigTable-backed {@link LongbowStore}. 
+ */ private Configuration configuration; + /** + * Schema describing the Longbow columns, keys and durations for the current job. + */ private LongbowSchema longBowSchema; + /** + * Strategy that derives the scan key range (lower and upper bounds) from each input row. + */ private LongbowRange longbowRange; + /** + * Client that issues asynchronous scans against the BigTable storage. + */ private LongbowStore longBowStore; + /** + * Manager used to emit meters and histograms for reader instrumentation. + */ private MeterStatsManager meterStatsManager; + /** + * Telemetry collected for this processor, keyed by telemetry type. + */ private Map> metrics = new HashMap<>(); + /** + * Reporter used to surface fatal and non-fatal reader exceptions. + */ private ErrorReporter errorReporter; + /** + * Strategy that parses raw BigTable scan results into output column values. + */ private LongbowData longbowData; + /** + * Factory that builds the appropriate {@link ScanRequest} for each input row. + */ private ScanRequestFactory scanRequestFactory; + /** + * Strategy that assembles the emitted row from the parsed scan result and input. + */ private ReaderOutputRow readerOutputRow; /** @@ -92,6 +125,16 @@ public LongbowReader(Configuration configuration, LongbowSchema longBowSchema, L } + /** + * {@inheritDoc} + * + *
<p>
Lazily initialises the BigTable {@link LongbowStore}, the {@link MeterStatsManager} and the + * {@link ErrorReporter} when they were not injected, then registers the reader meter group so + * that {@link LongbowReaderAspects} can be reported. + * + * @param internalFlinkConfig the Flink runtime configuration supplied when the function opens + * @throws Exception if the underlying store or metric resources cannot be initialised + */ @Override public void open(org.apache.flink.configuration.Configuration internalFlinkConfig) throws Exception { super.open(internalFlinkConfig); @@ -107,11 +150,25 @@ public void open(org.apache.flink.configuration.Configuration internalFlinkConfi meterStatsManager.register("longbow.reader", LongbowReaderAspects.values()); } + /** + * {@inheritDoc} + * + *
<p>
Registers the post-processor type telemetry for the Longbow reader before subscribers are + * notified. + */ @Override public void preProcessBeforeNotifyingSubscriber() { addMetric(TelemetryTypes.POST_PROCESSOR_TYPE.getValue(), Constants.LONGBOW_READER_PROCESSOR_KEY); } + /** + * {@inheritDoc} + * + *
<p>
Marks the close-connection event, logs the closure and releases the underlying + * {@link LongbowStore} if it was opened. + * + * @throws Exception if closing the parent function or the store fails + */ @Override public void close() throws Exception { super.close(); @@ -122,6 +179,17 @@ public void close() throws Exception { } } + /** + * {@inheritDoc} + * + *
<p>
Builds a {@link ScanRequest} for the input row, scans BigTable asynchronously and, once the + * scan completes, records instrumentation, parses the scanned data and completes the + * {@code resultFuture} with the assembled output row. Scan failures are logged and converted into + * an empty result. + * + * @param input the input row to enrich with Longbow data + * @param resultFuture the future completed with the single enriched output row + */ @Override public void asyncInvoke(Row input, ResultFuture resultFuture) { ScanRequest scanRequest = scanRequestFactory.create(input, longbowRange); @@ -144,6 +212,17 @@ public LongbowRange getLongbowRange() { return longbowRange; } + /** + * Records reader success metrics for a completed scan. + * + *
<p>
Marks the read-success event, updates the response-time and documents-per-scan histograms, + * and flags a failure-to-read-last-record event when the scan is empty or its first row key does + * not match the expected Longbow key for {@code input}. + * + * @param scanResult the list of BigTable results returned by the scan + * @param startTime the instant the scan started, used to compute the response time + * @param input the input row whose Longbow key is validated against the result + */ private void instrumentation(List scanResult, Instant startTime, Row input) { meterStatsManager.markEvent(LongbowReaderAspects.SUCCESS_ON_READ_DOCUMENT); meterStatsManager.updateHistogram(LongbowReaderAspects.SUCCESS_ON_READ_DOCUMENT_RESPONSE_TIME, between(startTime, Instant.now()).toMillis()); @@ -153,6 +232,17 @@ private void instrumentation(List scanResult, Instant startTime, Row inp } } + /** + * Handles a failed BigTable scan by logging and reporting it. + * + *
<p>
Logs the error, marks the read-failure event, reports a non-fatal + * {@code LongbowReaderException}, records the failure response time and yields an empty result so + * that the asynchronous pipeline can continue. + * + * @param ex the throwable raised while scanning BigTable + * @param startTime the instant the scan started, used to compute the response time + * @return an empty list of results, used as the fallback scan outcome + */ private List logException(Throwable ex, Instant startTime) { LOGGER.error("LongbowReader : failed to scan document from BigTable: {}", ex.getMessage()); ex.printStackTrace(); @@ -162,6 +252,15 @@ private List logException(Throwable ex, Instant startTime) { return Collections.emptyList(); } + /** + * {@inheritDoc} + * + *
<p>
Marks a reader timeout event, reports a fatal {@link TimeoutException} and completes the + * {@code resultFuture} exceptionally. + * + * @param input the input row whose asynchronous lookup timed out + * @param resultFuture the future completed exceptionally with the timeout error + */ @Override public void timeout(Row input, ResultFuture resultFuture) { LOGGER.error("LongbowReader : timeout when reading document"); @@ -171,11 +270,22 @@ public void timeout(Row input, ResultFuture resultFuture) { resultFuture.completeExceptionally(timeoutException); } + /** + * {@inheritDoc} + * + * @return the telemetry collected by this reader, keyed by telemetry type + */ @Override public Map> getTelemetry() { return metrics; } + /** + * Appends a telemetry value under the given key. + * + * @param key the telemetry type key to record under + * @param value the telemetry value to add for that key + */ private void addMetric(String key, String value) { metrics.computeIfAbsent(key, k -> new ArrayList<>()).add(value); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/processor/LongbowWriter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/processor/LongbowWriter.java index e8a608c71..30db52b1e 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/processor/LongbowWriter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/processor/LongbowWriter.java @@ -39,18 +39,54 @@ */ public class LongbowWriter extends RichAsyncFunction implements TelemetryPublisher { + /** + * Logger used to record table-creation and write failures for this writer. + */ private static final Logger LOGGER = LoggerFactory.getLogger(LongbowWriter.class.getName()); + /** + * Default BigTable column family, in bytes, under which Longbow values are written. 
+ */ private static final byte[] COLUMN_FAMILY_NAME = Bytes.toBytes(Constants.LONGBOW_COLUMN_FAMILY_DEFAULT); + /** + * Manager used to emit meters and histograms for writer instrumentation. + */ private MeterStatsManager meterStatsManager; + /** + * Schema describing the Longbow columns, keys and document duration for the current job. + */ private LongbowSchema longbowSchema; + /** + * Dagger configuration used to read writer settings and build the {@link LongbowStore}. + */ private Configuration configuration; + /** + * Configured time-to-live for written documents, used to derive the BigTable max-age GC rule. + */ private String longbowDocumentDuration; + /** + * Factory that builds the appropriate {@link PutRequest} for each input row. + */ private PutRequestFactory putRequestFactory; + /** + * Identifier of the BigTable table this writer persists records into. + */ private String tableId; + /** + * Strategy that assembles the row emitted after a successful write. + */ private WriterOutputRow writerOutputRow; + /** + * Client that creates tables and issues asynchronous writes against BigTable. + */ private LongbowStore longBowStore; + /** + * Telemetry collected for this processor, keyed by telemetry type. + */ private Map> metrics = new HashMap<>(); + /** + * Reporter used to surface fatal and non-fatal writer exceptions. + */ private ErrorReporter errorReporter; /** @@ -93,6 +129,18 @@ public LongbowWriter(Configuration configuration, LongbowSchema longbowSchema, P this.errorReporter = errorReporter; } + /** + * {@inheritDoc} + * + *

Lazily initialises the {@link LongbowStore}, {@link MeterStatsManager} and + * {@link ErrorReporter} when they were not injected, registers the writer meter group, and + * creates the target BigTable table if it does not yet exist. Table creation applies a + * max-versions of one together with a max-age derived from the configured document duration, and + * is instrumented for both success and failure. + * + * @param internalFlinkConfig the Flink runtime configuration supplied when the function opens + * @throws Exception if the store cannot be created or the table cannot be provisioned + */ @Override public void open(org.apache.flink.configuration.Configuration internalFlinkConfig) throws Exception { super.open(internalFlinkConfig); @@ -131,11 +179,28 @@ public void open(org.apache.flink.configuration.Configuration internalFlinkConfi } } + /** + * {@inheritDoc} + * + *

Registers the post-processor type telemetry for the Longbow writer before subscribers are + * notified. + */ @Override public void preProcessBeforeNotifyingSubscriber() { addMetric(TelemetryTypes.POST_PROCESSOR_TYPE.getValue(), Constants.LONGBOW_WRITER_PROCESSOR_KEY); } + /** + * {@inheritDoc} + * + *

Builds a {@link PutRequest} for the input row and writes it to BigTable asynchronously. On + * success it records the write metrics and completes {@code resultFuture} with the row produced by + * the configured {@link WriterOutputRow}; failures are logged and reported. + * + * @param input the input row to persist into BigTable + * @param resultFuture the future completed with the single output row once the write succeeds + * @throws Exception if the put request cannot be created or submitted + */ @Override public void asyncInvoke(Row input, ResultFuture resultFuture) throws Exception { PutRequest putRequest = putRequestFactory.create(input); @@ -149,6 +214,16 @@ public void asyncInvoke(Row input, ResultFuture resultFuture) throws Except }); } + /** + * Handles a failed BigTable write by logging and reporting it. + * + *

Logs the error, marks the write-failure event, reports a non-fatal + * {@code LongbowWriterException} and records the failure response time. + * + * @param ex the throwable raised while writing to BigTable + * @param startTime the instant the write started, used to compute the response time + * @return {@code null} always, matching the {@code Void} completion stage signature + */ private Void logException(Throwable ex, Instant startTime) { LOGGER.error("failed to write document to table '{}'", tableId); ex.printStackTrace(); @@ -159,6 +234,16 @@ private Void logException(Throwable ex, Instant startTime) { return null; } + /** + * Handles an asynchronous write that exceeded its configured timeout. + * + *

Marks a writer timeout event, reports a fatal {@link TimeoutException} and completes the + * {@code resultFuture} exceptionally. + * + * @param input the input row whose asynchronous write timed out + * @param resultFuture the future completed exceptionally with the timeout error + * @throws Exception if reporting the timeout fails + */ public void timeout(Row input, ResultFuture resultFuture) throws Exception { LOGGER.error("LongbowWriter : timeout when writing document"); meterStatsManager.markEvent(LongbowWriterAspects.TIMEOUTS_ON_WRITER); @@ -167,6 +252,14 @@ public void timeout(Row input, ResultFuture resultFuture) throws Exception resultFuture.completeExceptionally(timeoutException); } + /** + * {@inheritDoc} + * + *

Releases the underlying {@link LongbowStore} if it was opened, marks the close-connection + * event and logs the closure. + * + * @throws Exception if closing the parent function or the store fails + */ @Override public void close() throws Exception { super.close(); @@ -177,11 +270,22 @@ public void close() throws Exception { LOGGER.error("LongbowWriter : Connection closed"); } + /** + * {@inheritDoc} + * + * @return the telemetry collected by this writer, keyed by telemetry type + */ @Override public Map> getTelemetry() { return metrics; } + /** + * Appends a telemetry value under the given key. + * + * @param key the telemetry type key to record under + * @param value the telemetry value to add for that key + */ private void addMetric(String key, String value) { metrics.computeIfAbsent(key, k -> new ArrayList<>()).add(value); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/range/LongbowAbsoluteRange.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/range/LongbowAbsoluteRange.java index 01476e38a..ec46063b4 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/range/LongbowAbsoluteRange.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/range/LongbowAbsoluteRange.java @@ -9,6 +9,9 @@ * Absolute range on Longbow. */ public class LongbowAbsoluteRange implements LongbowRange { + /** + * The Longbow schema used to compute absolute row keys from the input row. + */ private LongbowSchema longbowSchema; /** @@ -20,16 +23,33 @@ public LongbowAbsoluteRange(LongbowSchema longbowSchema) { this.longbowSchema = longbowSchema; } + /** + * Computes the upper-bound BigTable row key for the scan from the absolute latest timestamp. 
+ * + * @param input the input row carrying the Longbow key and latest-timestamp field + * @return the upper-bound row key as a byte array + */ @Override public byte[] getUpperBound(Row input) { return longbowSchema.getAbsoluteKey(input, (long) longbowSchema.getValue(input, Constants.LONGBOW_LATEST_KEY)); } + /** + * Computes the lower-bound BigTable row key for the scan from the absolute earliest timestamp. + * + * @param input the input row carrying the Longbow key and earliest-timestamp field + * @return the lower-bound row key as a byte array + */ @Override public byte[] getLowerBound(Row input) { return longbowSchema.getAbsoluteKey(input, (long) longbowSchema.getValue(input, Constants.LONGBOW_EARLIEST_KEY)); } + /** + * Returns the fields that must not be present when an absolute range is used. + * + * @return an array containing the Longbow duration key, which is invalid for absolute ranges + */ @Override public String[] getInvalidFields() { return new String[]{Constants.LONGBOW_DURATION_KEY}; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/range/LongbowDurationRange.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/range/LongbowDurationRange.java index 6eb4d2ee1..7893f9fe4 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/range/LongbowDurationRange.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/range/LongbowDurationRange.java @@ -9,6 +9,9 @@ * Duration range on Longbow. */ public class LongbowDurationRange implements LongbowRange { + /** + * The Longbow schema used to compute row keys and resolve the lookback duration. + */ private LongbowSchema longbowSchema; /** @@ -20,16 +23,35 @@ public LongbowDurationRange(LongbowSchema longbowSchema) { this.longbowSchema = longbowSchema; } + /** + * Computes the upper-bound BigTable row key for the scan, anchored at the row's event time. 
+ * + * @param input the input row carrying the Longbow key and rowtime field + * @return the upper-bound row key as a byte array + */ @Override public byte[] getUpperBound(Row input) { return longbowSchema.getKey(input, 0); } + /** + * Computes the lower-bound BigTable row key for the scan by subtracting the configured duration + * from the row's event time. + * + * @param input the input row carrying the Longbow key, rowtime and duration fields + * @return the lower-bound row key as a byte array + */ @Override public byte[] getLowerBound(Row input) { return longbowSchema.getKey(input, longbowSchema.getDurationInMillis(input)); } + /** + * Returns the fields that must not be present when a duration range is used. + * + * @return an array containing the Longbow earliest and latest keys, which are invalid for + * duration ranges + */ @Override public String[] getInvalidFields() { return new String[]{Constants.LONGBOW_EARLIEST_KEY, Constants.LONGBOW_LATEST_KEY}; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ProtoBytePutRequest.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ProtoBytePutRequest.java index b76e52a06..dbc11cfea 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ProtoBytePutRequest.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ProtoBytePutRequest.java @@ -18,11 +18,29 @@ * Create PutRequest in form of proto byte. */ public class ProtoBytePutRequest implements PutRequest { + /** + * Default BigTable column family, in bytes, under which the serialized record is stored. + */ private static final byte[] COLUMN_FAMILY_NAME = Bytes.toBytes(Constants.LONGBOW_COLUMN_FAMILY_DEFAULT); + /** + * Default BigTable column qualifier, in bytes, holding the serialized proto value. 
+ */ private static final byte[] QUALIFIER_NAME = Bytes.toBytes(Constants.LONGBOW_QUALIFIER_DEFAULT); + /** + * Schema describing the Longbow key layout used to build the BigTable row key. + */ private final LongbowSchema longbowSchema; + /** + * Input row to be serialized and written to BigTable. + */ private final Row input; + /** + * Serializer that converts the input row into its Protobuf byte representation. + */ private final ProtoSerializer protoSerializer; + /** + * Identifier of the BigTable table this put targets. + */ private final String tableId; @@ -41,6 +59,15 @@ public ProtoBytePutRequest(LongbowSchema longbowSchema, Row input, ProtoSerializ this.tableId = tableId; } + /** + * Builds the BigTable {@link Put} for the input row in Longbow-plus (proto byte) form. + * + *

The row key is derived from the Longbow key, and a single cell is written under the default + * column family and qualifier with the row time as its timestamp and the serialized Protobuf bytes + * as its value. + * + * @return the assembled {@link Put} request + */ @Override public Put get() { Put putRequest = new Put(longbowSchema.getKey(input, 0)); @@ -49,11 +76,22 @@ public Put get() { return putRequest; } + /** + * {@inheritDoc} + * + * @return the identifier of the target BigTable table + */ @Override public String getTableId() { return this.tableId; } + /** + * Normalises a row-time field into a {@link Timestamp}. + * + * @param timeStampField the row-time value, either a {@link LocalDateTime} or a {@link Timestamp} + * @return the equivalent {@link Timestamp} + */ private Timestamp convertToTimeStamp(Object timeStampField) { if (timeStampField instanceof LocalDateTime) { return Timestamp.valueOf((LocalDateTime) timeStampField); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ProtoByteScanRequest.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ProtoByteScanRequest.java index 0c475dffb..66ba683dd 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ProtoByteScanRequest.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ProtoByteScanRequest.java @@ -9,10 +9,25 @@ * Request scan the proto byte. */ public class ProtoByteScanRequest implements ScanRequest { + /** + * Default BigTable column family, in bytes, scanned for the serialized record. + */ private static final byte[] COLUMN_FAMILY_NAME = Bytes.toBytes(Constants.LONGBOW_COLUMN_FAMILY_DEFAULT); + /** + * Default BigTable column qualifier, in bytes, holding the serialized proto value. + */ private static final byte[] QUALIFIER_NAME = Bytes.toBytes(Constants.LONGBOW_QUALIFIER_DEFAULT); + /** + * Inclusive start row key of the scan range. 
+ */ private byte[] startRow; + /** + * Inclusive stop row key of the scan range. + */ private byte[] stopRow; + /** + * Identifier of the BigTable table to scan. + */ private String tableId; @@ -29,6 +44,14 @@ public ProtoByteScanRequest(byte[] startRow, byte[] stopRow, String tableId) { this.tableId = tableId; } + /** + * Builds the BigTable {@link Scan} for the configured range in Longbow-plus (proto byte) form. + * + *

The scan covers the start-to-stop row range and reads only the default column family and + * qualifier that hold the serialized Protobuf value. + * + * @return the assembled {@link Scan} request + */ @Override public Scan get() { Scan scan = setScanRange(startRow, stopRow); @@ -36,6 +59,11 @@ public Scan get() { return scan; } + /** + * {@inheritDoc} + * + * @return the identifier of the table to scan + */ @Override public String getTableId() { return tableId; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/PutRequestFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/PutRequestFactory.java index 7a129eefa..e89d9664a 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/PutRequestFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/PutRequestFactory.java @@ -13,8 +13,17 @@ */ public class PutRequestFactory implements Serializable { + /** + * Schema used to decide between the table and proto-byte put representations. + */ private final LongbowSchema longbowSchema; + /** + * Serializer passed to the proto-byte put request when Longbow-plus is enabled. + */ private final ProtoSerializer protoSerializer; + /** + * Identifier of the BigTable table the created put requests target. + */ private final String tableId; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ScanRequestFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ScanRequestFactory.java index cae40431a..71c9b71aa 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ScanRequestFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/ScanRequestFactory.java @@ -13,7 +13,13 @@ * The factory class for scan request. 
*/ public class ScanRequestFactory implements Serializable { + /** + * Schema used to decide between the table and proto-byte scan representations. + */ private LongbowSchema longbowSchema; + /** + * Identifier of the BigTable table the created scan requests target. + */ private String tableId; /** @@ -42,6 +48,12 @@ public ScanRequest create(Row input, LongbowRange longbowRange) { } } + /** + * Resolves the BigTable table name carried by the input row for Longbow-plus scans. + * + * @param input the input row holding the synchronizer-provided table id + * @return the table name read from the {@code Constants.SYNCHRONIZER_BIGTABLE_TABLE_ID_KEY} column + */ private String parseTableName(Row input) { return (String) longbowSchema.getValue(input, Constants.SYNCHRONIZER_BIGTABLE_TABLE_ID_KEY); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/TablePutRequest.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/TablePutRequest.java index e8203c6a2..3a637b8df 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/TablePutRequest.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/TablePutRequest.java @@ -19,10 +19,22 @@ */ public class TablePutRequest implements PutRequest { + /** + * Default BigTable column family, in bytes, under which Longbow data columns are written. + */ private static final byte[] COLUMN_FAMILY_NAME = Bytes.toBytes(Constants.LONGBOW_COLUMN_FAMILY_DEFAULT); + /** + * Schema describing the Longbow key and the data columns to persist. + */ private LongbowSchema longbowSchema; + /** + * Input row whose Longbow data columns are written to BigTable. + */ private Row input; + /** + * Identifier of the BigTable table this put targets. 
+ */ private String tableId; /** @@ -38,6 +50,15 @@ public TablePutRequest(LongbowSchema longbowSchema, Row input, String tableId) { this.tableId = tableId; } + /** + * Builds the BigTable {@link Put} for the input row in table (column-per-field) form. + * + *

The row key is derived from the Longbow key, and one cell is added per Longbow data column, + * each written under the default column family with the row time as timestamp and the column's + * string value as bytes. + * + * @return the assembled {@link Put} request + */ @Override public Put get() { Put putRequest = new Put(longbowSchema.getKey(input, 0)); @@ -48,6 +69,12 @@ public Put get() { return putRequest; } + /** + * Normalises a row-time field into a {@link Timestamp}. + * + * @param timeStampField the row-time value, either a {@link LocalDateTime} or a {@link Timestamp} + * @return the equivalent {@link Timestamp} + */ private Timestamp convertToTimeStamp(Object timeStampField) { if (timeStampField instanceof LocalDateTime) { return Timestamp.valueOf((LocalDateTime) timeStampField); @@ -55,6 +82,11 @@ private Timestamp convertToTimeStamp(Object timeStampField) { return (Timestamp) timeStampField; } + /** + * {@inheritDoc} + * + * @return the identifier of the target BigTable table + */ @Override public String getTableId() { return this.tableId; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/TableScanRequest.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/TableScanRequest.java index 1ba069be9..0c15cbf72 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/TableScanRequest.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/request/TableScanRequest.java @@ -12,10 +12,25 @@ * Request scan the table. */ public class TableScanRequest implements ScanRequest { + /** + * Default BigTable column family, in bytes, scanned for Longbow data columns. + */ private static final byte[] COLUMN_FAMILY_NAME = Bytes.toBytes(Constants.LONGBOW_COLUMN_FAMILY_DEFAULT); + /** + * Inclusive start row key of the scan range. + */ private byte[] startRow; + /** + * Inclusive stop row key of the scan range. 
+ */ private byte[] stopRow; + /** + * Schema describing which Longbow data columns to read. + */ private LongbowSchema longbowSchema; + /** + * Identifier of the BigTable table to scan. + */ private String tableId; /** @@ -33,6 +48,14 @@ public TableScanRequest(byte[] startRow, byte[] stopRow, LongbowSchema longbowSc this.tableId = tableId; } + /** + * Builds the BigTable {@link Scan} for the configured range in table (column-per-field) form. + * + *

The scan covers the start-to-stop row range and adds every Longbow data column from the + * schema under the default column family. + * + * @return the assembled {@link Scan} request + */ @Override public Scan get() { Scan scan = setScanRange(startRow, stopRow); @@ -43,11 +66,22 @@ public Scan get() { return scan; } + /** + * {@inheritDoc} + * + * @return the identifier of the table to scan + */ @Override public String getTableId() { return tableId; } + /** + * Determines whether the given schema column holds Longbow data. + * + * @param c a schema entry mapping a column name to its row index + * @return {@code true} if the column name contains the Longbow data key marker, otherwise {@code false} + */ private boolean isLongbowData(Map.Entry c) { return c.getKey().contains(Constants.LONGBOW_DATA_KEY); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/storage/LongbowStore.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/storage/LongbowStore.java index 59652386e..8b4862ca5 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/storage/LongbowStore.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/storage/LongbowStore.java @@ -25,16 +25,37 @@ * A class that responsible to store the event to big table for longbow. */ public class LongbowStore { + /** + * Admin client used to check for and create BigTable tables. + */ private BigtableTableAdminClient adminClient; + /** + * Asynchronous connection used to read from and write to BigTable tables. + */ private BigtableAsyncConnection tableClient; + /** + * Cache of opened asynchronous tables keyed by table id. + */ private Map> tables; + /** + * Creates a store backed by the given BigTable admin and async connection clients. 
+ * + * @param adminClient the client used to manage BigTable tables + * @param tableClient the asynchronous connection used to access table data + */ private LongbowStore(BigtableTableAdminClient adminClient, BigtableAsyncConnection tableClient) { this.adminClient = adminClient; this.tableClient = tableClient; this.tables = new HashMap<>(); } + /** + * Returns the asynchronous table handle for the given id, opening and caching it on first use. + * + * @param tableId the identifier of the BigTable table + * @return the cached or newly opened {@code AsyncTable} for the table + */ private AsyncTable getTable(String tableId) { if (!tables.containsKey(tableId)) { tables.put(tableId, tableClient.getTable(TableName.valueOf(tableId))); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/validator/LongbowType.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/validator/LongbowType.java index 126475aef..1ff8a51df 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/validator/LongbowType.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/validator/LongbowType.java @@ -8,15 +8,43 @@ * The enum Longbow type. */ public enum LongbowType { + /** + * Read-only Longbow that scans previously written BigTable data back into the stream. + */ LongbowRead(LongbowKey.LONGBOW_READ, MandatoryFields.LONGBOW_READ, InvalidFields.LONGBOW_READ), + /** + * Write-only Longbow that persists the input stream into BigTable for later reads. + */ LongbowWrite(LongbowKey.LONGBOW_WRITE, MandatoryFields.LONGBOW_WRITE, InvalidFields.LONGBOW_WRITE), + /** + * Combined Longbow that both writes the input and reads back the configured range in one flow. + */ LongbowProcess(LongbowKey.LONGBOW_PROCESS, MandatoryFields.LONGBOW_PROCESS, InvalidFields.LONGBOW_PROCESS); + /** + * Suffix stripped from the key name to derive the human-readable Longbow type name. 
+ */ private static final String LONGBOW_TYPE_PREFIX = "_key"; + /** + * The schema key whose presence identifies this Longbow type. + */ private String keyName; + /** + * The fields that must be present in the schema for this Longbow type to be valid. + */ private String[] mandatoryFields; + /** + * The fields that must not be present in the schema for this Longbow type to be valid. + */ private String[] invalidFields; + /** + * Instantiates a new Longbow type. + * + * @param keyName the schema key that identifies this Longbow type + * @param mandatoryFields the fields required for this Longbow type + * @param invalidFields the fields disallowed for this Longbow type + */ LongbowType(String keyName, String[] mandatoryFields, String[] invalidFields) { this.keyName = keyName; this.mandatoryFields = mandatoryFields; @@ -59,21 +87,57 @@ public String getTypeName() { return keyName.replace(LONGBOW_TYPE_PREFIX, ""); } + /** + * Holds the mandatory schema fields required by each Longbow type. + */ private static class MandatoryFields { + /** + * Mandatory fields for the combined Longbow process type. + */ private static final String[] LONGBOW_PROCESS = new String[]{Constants.LONGBOW_DATA_KEY, Constants.EVENT_TIMESTAMP, ROWTIME}; + /** + * Mandatory fields for the Longbow write type. + */ private static final String[] LONGBOW_WRITE = new String[]{ROWTIME, Constants.EVENT_TIMESTAMP}; + /** + * Mandatory fields for the Longbow read type. + */ private static final String[] LONGBOW_READ = new String[]{Constants.EVENT_TIMESTAMP}; } + /** + * Holds the identifying schema key for each Longbow type. + */ private static class LongbowKey { + /** + * Schema key identifying the combined Longbow process type. + */ private static final String LONGBOW_PROCESS = "longbow_key"; + /** + * Schema key identifying the Longbow write type. + */ private static final String LONGBOW_WRITE = "longbow_write_key"; + /** + * Schema key identifying the Longbow read type. 
+ */ private static final String LONGBOW_READ = "longbow_read_key"; } + /** + * Holds the disallowed schema fields for each Longbow type. + */ private static class InvalidFields { + /** + * Disallowed fields for the combined Longbow process type. + */ private static final String[] LONGBOW_PROCESS = new String[]{Constants.LONGBOW_PROTO_DATA_KEY}; + /** + * Disallowed fields for the Longbow write type. + */ private static final String[] LONGBOW_WRITE = new String[]{Constants.LONGBOW_PROTO_DATA_KEY, Constants.LONGBOW_DATA_KEY, Constants.LONGBOW_LATEST_KEY, Constants.LONGBOW_EARLIEST_KEY, Constants.LONGBOW_DURATION_KEY}; + /** + * Disallowed fields for the Longbow read type. + */ private static final String[] LONGBOW_READ = new String[]{Constants.LONGBOW_DATA_KEY, Constants.LONGBOW_PROTO_DATA_KEY}; } } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/validator/LongbowValidator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/validator/LongbowValidator.java index 78e341a43..f0cfdc378 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/validator/LongbowValidator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/longbow/validator/LongbowValidator.java @@ -10,6 +10,9 @@ * The Longbow validator. */ public class LongbowValidator { + /** + * The input stream column names validated against each Longbow type's field requirements. 
+ */ private String[] columnNames; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/telemetry/TelemetryProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/telemetry/TelemetryProcessor.java index 19ccd9e83..da5bf7b1f 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/telemetry/TelemetryProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/telemetry/TelemetryProcessor.java @@ -12,6 +12,7 @@ * The Telemetry processor. */ public class TelemetryProcessor implements PostProcessor { + /** The map function that records and exports telemetry metrics for the stream. */ private MetricsTelemetryExporter metricsTelemetryExporter; /** @@ -23,12 +24,29 @@ public TelemetryProcessor(MetricsTelemetryExporter metricsTelemetryExporter) { this.metricsTelemetryExporter = metricsTelemetryExporter; } + /** + * Attaches the telemetry exporter to the stream as a map function. + * + *

Records pass through unchanged; the exporter registers metrics as a side effect, and the + * column names are preserved on the returned {@link StreamInfo}. + * + * @param inputStreamInfo the upstream stream together with its column names + * @return a new {@link StreamInfo} wrapping the instrumented stream and the unchanged column names + */ @Override public StreamInfo process(StreamInfo inputStreamInfo) { DataStream resultStream = inputStreamInfo.getDataStream().map(metricsTelemetryExporter); return new StreamInfo(resultStream, inputStreamInfo.getColumnNames()); } + /** + * Indicates whether this post processor is applicable to the supplied configuration. + * + *

Telemetry is always collected, so this processor applies to every configuration. + * + * @param postProcessorConfig the post processor configuration (ignored) + * @return {@code true} always + */ @Override public boolean canProcess(PostProcessorConfig postProcessorConfig) { return true; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/telemetry/processor/MetricsTelemetryExporter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/telemetry/processor/MetricsTelemetryExporter.java index 0f293190a..c0acd9fbc 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/telemetry/processor/MetricsTelemetryExporter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/telemetry/processor/MetricsTelemetryExporter.java @@ -21,9 +21,13 @@ * The Metrics telemetry exporter. */ public class MetricsTelemetryExporter extends RichMapFunction implements TelemetrySubscriber { + /** Logger used to report the metrics registered with the underlying stats manager. */ private static final Logger LOGGER = LoggerFactory.getLogger(MetricsTelemetryExporter.class.getName()); + /** Manages gauge metrics; lazily created from the runtime metric group when not injected. */ private GaugeStatsManager gaugeStatsManager; + /** Constant gauge value reported for every registered telemetry aspect. */ private Integer gaugeValue = 1; + /** Accumulated telemetry, mapping each metric group key to the set of values seen for it. */ private Map> metrics = new HashMap<>(); /** @@ -41,6 +45,15 @@ public MetricsTelemetryExporter(GaugeStatsManager gaugeStatsManager) { public MetricsTelemetryExporter() { } + /** + * Initialises the gauge stats manager and registers any pending metric groups. + * + *

Called by Flink when the function is opened. When no {@link GaugeStatsManager} was injected + * one is created from the runtime metric group, and previously collected metrics are registered. + * + * @param parameters the Flink job/runtime configuration + * @throws Exception if the underlying {@link RichMapFunction} initialisation fails + */ @Override public void open(Configuration parameters) throws Exception { if (gaugeStatsManager == null) { @@ -51,11 +64,29 @@ public void open(Configuration parameters) throws Exception { } } + /** + * Passes each record through unchanged. + * + *

This exporter only collects telemetry as a side effect of stream setup and does not modify + * the data flowing through it. + * + * @param inputRow the incoming record + * @return the same {@code inputRow}, unmodified + * @throws Exception if record handling fails + */ @Override public Row map(Row inputRow) throws Exception { return inputRow; } + /** + * Receives telemetry from a publisher and registers the merged metrics. + * + *

Invoked when a subscribed {@link TelemetryPublisher} announces new telemetry; the values are + * merged into the accumulated metrics and, when a stats manager is available, registered. + * + * @param publisher the publisher whose telemetry is merged into this exporter + */ @Override public void updated(TelemetryPublisher publisher) { mergeMetrics(publisher.getTelemetry()); @@ -64,6 +95,14 @@ public void updated(TelemetryPublisher publisher) { } } + /** + * Merges telemetry from a publisher into the accumulated metric groups. + * + *

Each value is added to the set stored under its group key, creating the set on first use so + * duplicate values are ignored. + * + * @param metricsFromPublisher the per-group metric values reported by a publisher + */ private void mergeMetrics(Map> metricsFromPublisher) { metricsFromPublisher.forEach((key, value) -> { metrics.computeIfAbsent(key, x -> new HashSet<>()).addAll(value); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TableTransformConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TableTransformConfig.java index 11f76b2d3..c59a2ea47 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TableTransformConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TableTransformConfig.java @@ -7,7 +7,13 @@ * A class that holds Table transformer configuration. */ public class TableTransformConfig { + /** + * The name of the input table whose stream these transformers are applied to. + */ protected String tableName; + /** + * The ordered list of {@link TransformConfig} entries applied to the table's stream. + */ protected List transformers; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformConfig.java index 7eb6d9678..fe495fae6 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformConfig.java @@ -11,7 +11,14 @@ */ public class TransformConfig implements Validator, Serializable { + /** + * The fully-qualified class name of the + * {@link com.gotocompany.dagger.common.core.Transformer} implementation to instantiate and apply. 
+ */ private final String transformationClass; + /** + * The arguments supplied to the transformer, keyed by argument name. + */ private final Map transformationArguments; /** @@ -51,12 +58,30 @@ public Map getTransformationArguments() { return transformationArguments; } + /** + * Returns the fields that must be present for this configuration to be considered valid. + * + *

For a transform config the only mandatory field is the transformation class name. + * + * @return a map containing the {@code transformationClass} entry to be validated + */ public HashMap getMandatoryFields() { HashMap mandatoryFields = new HashMap<>(); mandatoryFields.put("transformationClass", transformationClass); return mandatoryFields; } + /** + * Validates this configuration, ensuring mandatory fields are present and that no reserved + * default-argument key is supplied by the user. + * + *

This first runs the default {@link Validator#validateFields()} checks, then rejects any + * transformation argument whose key collides with a {@link TransformerUtils.DefaultArgument} + * (for example {@code table_name}), since those keys are populated internally. + * + * @throws IllegalArgumentException if a mandatory field is missing or a reserved argument key + * is present in the transformation arguments + */ @Override public void validateFields() throws IllegalArgumentException { Validator.super.validateFields(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformProcessor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformProcessor.java index acca01125..760443da3 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformProcessor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformProcessor.java @@ -23,6 +23,9 @@ * The Transformer processor. */ public class TransformProcessor implements Preprocessor, PostProcessor, TelemetryPublisher { + /** + * The ordered list of {@link TransformConfig} entries this processor applies to the stream. + */ protected final List transformConfigs; @@ -35,9 +38,21 @@ public String getTableName() { return tableName; } + /** + * The name of the input table this processor is bound to, or {@code "NULL"} when unscoped. + */ protected final String tableName; + /** + * The telemetry metrics gathered by this processor, keyed by metric type. + */ private final Map> metrics = new HashMap<>(); + /** + * The telemetry classification (pre-processor or post-processor) of this processor. + */ protected final TelemetryTypes type; + /** + * The Dagger context providing access to job configuration and the Flink runtime. 
+ */ private final DaggerContext daggerContext; /** @@ -65,6 +80,18 @@ public TransformProcessor(String tableName, TelemetryTypes type, ListFor each {@link TransformConfig} the declared {@link Transformer} implementation is loaded + * reflectively and its {@code transform} method is invoked, chaining the output of one + * transformer into the input of the next. + * + * @param streamInfo the stream metadata and data stream to transform + * @return the resulting {@link StreamInfo} after all transformers have been applied + * @throws TransformClassNotDefinedException if a configured transformer class cannot be loaded + * or instantiated + */ @Override public StreamInfo process(StreamInfo streamInfo) { for (TransformConfig transformConfig : transformConfigs) { @@ -79,16 +106,37 @@ public StreamInfo process(StreamInfo streamInfo) { return streamInfo; } + /** + * Determines whether this processor should run for the given pre-processor configuration. + * + * @param processorConfig the pre-processor configuration to inspect + * @return {@code true} if the configuration declares a table transformer whose table name + * matches this processor's table name, {@code false} otherwise + */ @Override public boolean canProcess(PreProcessorConfig processorConfig) { return processorConfig.getTableTransformers().stream().anyMatch(x -> x.tableName.equals(this.tableName)); } + /** + * Determines whether this processor should run for the given post-processor configuration. + * + * @param processorConfig the post-processor configuration to inspect + * @return {@code true} if the configuration declares any transform configs, {@code false} + * otherwise + */ @Override public boolean canProcess(PostProcessorConfig processorConfig) { return processorConfig.hasTransformConfigs(); } + /** + * Registers this processor's telemetry metric before telemetry subscribers are notified. + * + *

The metric key recorded depends on the configured {@link TelemetryTypes}: post-processor + * usage is tracked under the generic transform-processor key, while pre-processor usage is + * tracked under a table-scoped key. Other telemetry types record nothing. + */ @Override public void preProcessBeforeNotifyingSubscriber() { switch (this.type) { @@ -103,11 +151,22 @@ public void preProcessBeforeNotifyingSubscriber() { } } + /** + * Returns the telemetry metrics gathered by this processor. + * + * @return a map of metric type to the list of recorded metric values + */ @Override public Map> getTelemetry() { return metrics; } + /** + * Records a single telemetry metric value under the given key, creating the list if needed. + * + * @param key the metric type key to record under + * @param value the metric value to append + */ private void addMetric(String key, String value) { metrics.computeIfAbsent(key, k -> new ArrayList<>()).add(value); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformerUtils.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformerUtils.java index bb10c8409..041334ab2 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformerUtils.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/transformers/TransformerUtils.java @@ -12,12 +12,26 @@ enum DefaultArgument { * Table name default argument. */ INPUT_SCHEMA_TABLE("table_name"); + /** + * The serialized argument key used to look this default argument up in a transformer's + * argument map. + */ private final String argument; + /** + * Creates a default argument bound to the given serialized key. + * + * @param argument the argument key as it appears in a transformer's argument map + */ DefaultArgument(String argument) { this.argument = argument; } + /** + * Returns the serialized argument key for this default argument. 
+ * + * @return the argument key + */ @Override public String toString() { return this.argument; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/types/FilterDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/types/FilterDecorator.java index 384a21979..d9a58c7b1 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/types/FilterDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/types/FilterDecorator.java @@ -9,6 +9,14 @@ */ public interface FilterDecorator extends FilterFunction<Row>, StreamDecorator { + /** + * Decorates the given stream by applying this filter to it. + * + * <p>

{@inheritDoc} + * + * @param inputStream the input stream of {@link Row} records to filter + * @return a data stream retaining only the records that satisfy this filter + */ @Override default DataStream<Row> decorate(DataStream<Row> inputStream) { return inputStream.filter(this); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/types/MapDecorator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/types/MapDecorator.java index 5e17a4d29..e717e87fb 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/types/MapDecorator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/processors/types/MapDecorator.java @@ -9,6 +9,14 @@ */ public interface MapDecorator extends MapFunction<Row, Row>, StreamDecorator { + /** + * Decorates the given stream by applying this map function to each record. + * + * <p>

{@inheritDoc} + * + * @param inputStream the input stream of {@link Row} records to map + * @return a data stream with this map function applied to every record + */ @Override default DataStream<Row> decorate(DataStream<Row> inputStream) { return inputStream.map(this); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/SinkOrchestrator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/SinkOrchestrator.java index f8f418a77..bd479ed1c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/SinkOrchestrator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/SinkOrchestrator.java @@ -37,9 +37,22 @@ * Responsible for handling the sink type. */ public class SinkOrchestrator implements TelemetryPublisher { + /** + * Exporter subscribed to telemetry publishers (such as the Kafka serializer builder) created + * while building a sink. + */ private final MetricsTelemetryExporter telemetryExporter; + /** + * Telemetry gathered during sink construction, keyed by metric type with the list of recorded + * values, exposed through {@link #getTelemetry()}. + */ private final Map<String, List<String>> metrics; + /** + * Instantiates a new sink orchestrator. + * + * @param telemetryExporter the exporter that receives telemetry published while sinks are built + */ public SinkOrchestrator(MetricsTelemetryExporter telemetryExporter) { this.telemetryExporter = telemetryExporter; this.metrics = new HashMap<>(); @@ -95,6 +108,13 @@ public Sink getSink(Configuration configuration, String[] columnNames, StencilCl return sink; } + /** + * Subscribes the orchestrator's telemetry exporter to the given Kafka serializer builder so that + * schema-related telemetry produced while serializing is forwarded to the exporter. 
+ * + * @param kafkaSchemaBuilder the Kafka serializer builder, which also acts as a + * {@code TelemetryPublisher} + */ private void reportTelemetry(KafkaSerializerBuilder kafkaSchemaBuilder) { TelemetryPublisher pub = (TelemetryPublisher) kafkaSchemaBuilder; pub.addSubscriber(telemetryExporter); @@ -124,6 +144,12 @@ protected Properties getProducerProperties(Configuration configuration) { return kafkaProducerConfigs; } + /** + * Validates that the configured Kafka producer {@code linger.ms} value is a parseable integer. + * + * @param lingerMs the linger-milliseconds value as a string + * @throws IllegalArgumentException if {@code lingerMs} cannot be parsed as an integer + */ private void validateLingerMs(String lingerMs) { try { Integer.parseInt(lingerMs); @@ -132,11 +158,25 @@ private void validateLingerMs(String lingerMs) { } } + /** + * {@inheritDoc} + * + *

Returns the telemetry accumulated by this orchestrator while building sinks, keyed by metric + * type with the list of recorded values. + * + * @return the telemetry map of metric type to the list of recorded values + */ @Override public Map> getTelemetry() { return metrics; } + /** + * Records a telemetry value under the given key, creating the backing value list on first use. + * + * @param key the metric type key + * @param value the metric value to append for that key + */ private void addMetric(String key, String value) { metrics.computeIfAbsent(key, k -> new ArrayList<>()).add(value); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySink.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySink.java index 37d8cc0e5..c02952f64 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySink.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySink.java @@ -24,12 +24,33 @@ import java.util.Optional; import java.util.Set; +/** + * A Flink {@link Sink} implementation that writes Dagger output {@link Row} records to BigQuery. + * + *

Each row is serialized with a {@link ProtoSerializer} and handed to the depot + * {@code BigQuerySinkFactory}, which performs the actual batched writes. This sink does not rely on + * Flink's committer or global-committer mechanism, so all committer-related factory methods return + * {@link Optional#empty()}. + */ public class BigQuerySink implements Sink { + /** Serializes each output {@link Row} into protobuf key and value bytes. */ private final ProtoSerializer protoSerializer; + /** Dagger configuration supplying BigQuery sink settings, batch size and parameters. */ private final Configuration configuration; + /** Reporter used to build the StatsD instrumentation passed to the depot sink factory. */ private final DaggerStatsDReporter daggerStatsDReporter; + /** Lazily-initialized depot factory that creates the underlying BigQuery sink; not serialized. */ private transient BigQuerySinkFactory sinkFactory; + /** + * Instantiates a new BigQuery sink without a pre-built sink factory. + * + *

The {@code BigQuerySinkFactory} is created lazily when the first writer is opened. + * + * @param configuration the Dagger configuration carrying BigQuery sink settings + * @param protoSerializer the serializer that converts rows into protobuf messages + * @param daggerStatsDReporter the StatsD reporter used for sink instrumentation + */ protected BigQuerySink(Configuration configuration, ProtoSerializer protoSerializer, DaggerStatsDReporter daggerStatsDReporter) { this(configuration, protoSerializer, null, daggerStatsDReporter); } @@ -44,6 +65,18 @@ protected BigQuerySink(Configuration configuration, ProtoSerializer protoSeriali this.daggerStatsDReporter = daggerStatsDReporter; } + /** + * {@inheritDoc} + * + *

Lazily initializes the depot {@code BigQuerySinkFactory} on first invocation, resolves the + * configured batch size and the set of {@code ErrorType}s that should fail the job, and returns a + * {@link BigQuerySinkWriter} bound to a freshly created depot sink. Initialization failures are + * reported as fatal exceptions before being rethrown. + * + * @param context the sink initialization context providing the metric group + * @param states the restored writer states; unused because this sink keeps no state + * @return a new {@link BigQuerySinkWriter} that batches and writes rows to BigQuery + */ @Override public SinkWriter createWriter(InitContext context, List states) { ErrorReporter errorReporter = ErrorReporterFactory.getErrorReporter(context.metricGroup(), configuration); @@ -71,26 +104,63 @@ public SinkWriter createWriter(InitContext context, List return new BigQuerySinkWriter(protoSerializer, sink, batchSize, errorReporter, errorTypesForFailing); } + /** + * {@inheritDoc} + * + *

This sink keeps no writer state, so no serializer is provided. + * + * @return an empty {@link Optional} + */ @Override public Optional> getWriterStateSerializer() { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

BigQuery writes are committed inline by the writer, so no Flink committer is used. + * + * @return an empty {@link Optional} + * @throws IOException never thrown by this implementation + */ @Override public Optional> createCommitter() throws IOException { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

No global commit phase is required for the BigQuery sink. + * + * @return an empty {@link Optional} + * @throws IOException never thrown by this implementation + */ @Override public Optional> createGlobalCommitter() throws IOException { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

No committables are produced, so no serializer is provided. + * + * @return an empty {@link Optional} + */ @Override public Optional> getCommittableSerializer() { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

No global committables are produced, so no serializer is provided. + * + * @return an empty {@link Optional} + */ @Override public Optional> getGlobalCommittableSerializer() { return Optional.empty(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySinkBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySinkBuilder.java index 2ae1322f3..30d340862 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySinkBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySinkBuilder.java @@ -9,6 +9,15 @@ import java.util.HashMap; import java.util.Map; +/** + * Fluent builder for {@link BigQuerySink}. + * + *

Collects the output column names, the Stencil client orchestrator, the job + * {@link Configuration} and the StatsD reporter, then on {@link #build()} constructs the + * {@link ProtoSerializer} used to encode rows and overlays a set of opinionated default settings + * (Stencil caching/refresh behaviour and BigQuery storage options) before instantiating the sink. + * Obtain an instance through {@link #create()} and chain the {@code set*} methods. + */ public class BigQuerySinkBuilder { private String[] columnNames; @@ -16,13 +25,32 @@ public class BigQuerySinkBuilder { private Configuration configuration; private DaggerStatsDReporter daggerStatsDReporter; + /** + * Creates an empty builder; use {@link #create()} to obtain instances. + */ private BigQuerySinkBuilder() { } + /** + * Creates a new, empty builder. + * + * @return a fresh {@link BigQuerySinkBuilder} + */ public static BigQuerySinkBuilder create() { return new BigQuerySinkBuilder(); } + /** + * Builds the {@link BigQuerySink} from the configured values. + * + *

Constructs a {@link ProtoSerializer} from the configured proto key/message classes + * ({@code SINK_CONNECTOR_SCHEMA_PROTO_KEY_CLASS} and + * {@code SINK_CONNECTOR_SCHEMA_PROTO_MESSAGE_CLASS}), the column names and the Stencil client + * orchestrator, applies the enforced defaults from {@link #setDefaultValues(Configuration)} and + * returns the assembled sink. + * + * @return a configured {@link BigQuerySink} + */ public BigQuerySink build() { ProtoSerializer protoSerializer = new ProtoSerializer( configuration.getString("SINK_CONNECTOR_SCHEMA_PROTO_KEY_CLASS", ""), @@ -33,6 +61,17 @@ public BigQuerySink build() { return new BigQuerySink(conf, protoSerializer, daggerStatsDReporter); } + /** + * Returns a copy of the given configuration with BigQuery-sink defaults overlaid. + * + *

Forces a fixed set of Stencil schema-registry caching, refresh, retry and timeout options + * and enables BigQuery storage-API writes with row-insert-id disabled and a {@code dagger_} + * metrics prefix. These values take precedence over the corresponding keys in the input + * configuration. + * + * @param inputConf the original job configuration whose parameters are copied + * @return a new {@link Configuration} containing the input values plus the enforced defaults + */ private Configuration setDefaultValues(Configuration inputConf) { Map configMap = new HashMap<>(inputConf.getParam().toMap()); configMap.put("SCHEMA_REGISTRY_STENCIL_CACHE_AUTO_REFRESH", "false"); @@ -48,21 +87,45 @@ private Configuration setDefaultValues(Configuration inputConf) { return new Configuration(ParameterTool.fromMap(configMap)); } + /** + * Sets the job configuration that drives proto serialization and sink behaviour. + * + * @param configuration the job configuration + * @return this builder, for chaining + */ public BigQuerySinkBuilder setConfiguration(Configuration configuration) { this.configuration = configuration; return this; } + /** + * Sets the output column names that are mapped onto the protobuf message fields. + * + * @param columnNames the output column names + * @return this builder, for chaining + */ public BigQuerySinkBuilder setColumnNames(String[] columnNames) { this.columnNames = columnNames; return this; } + /** + * Sets the Stencil client orchestrator used to resolve protobuf descriptors during serialization. + * + * @param stencilClientOrchestrator the Stencil client orchestrator + * @return this builder, for chaining + */ public BigQuerySinkBuilder setStencilClientOrchestrator(StencilClientOrchestrator stencilClientOrchestrator) { this.stencilClientOrchestrator = stencilClientOrchestrator; return this; } + /** + * Sets the StatsD reporter forwarded to the Depot BigQuery sink for metrics. 
+ * + * @param daggerStatsDReporter the StatsD reporter + * @return this builder, for chaining + */ public BigQuerySinkBuilder setDaggerStatsDReporter(DaggerStatsDReporter daggerStatsDReporter) { this.daggerStatsDReporter = daggerStatsDReporter; return this; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySinkWriter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySinkWriter.java index 5e64a3f93..a828d6bfa 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySinkWriter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/bigquery/BigQuerySinkWriter.java @@ -22,16 +22,41 @@ import java.util.Collections; import java.util.stream.Collectors; +/** + * Flink {@link SinkWriter} that batches Dagger {@link Row} records and writes them to BigQuery through + * the depot {@link Sink}. + * + *

Rows are serialized with a {@link ProtoSerializer} and buffered until the configured batch size is + * reached, at which point the batch is flushed. Errors returned by the depot sink are reported via an + * {@link ErrorReporter}; any error whose {@code ErrorType} is configured as fatal causes a + * {@link BigQueryWriterException} to be thrown. + */ @Slf4j public class BigQuerySinkWriter implements SinkWriter { + /** Serializes each {@link Row} into protobuf key and value bytes. */ private final ProtoSerializer protoSerializer; + /** The depot BigQuery sink that performs the actual writes. */ private final Sink bigquerySink; + /** Maximum number of messages buffered before a batch is flushed to BigQuery. */ private final int batchSize; + /** Reporter used to surface fatal and non-fatal sink errors as metrics. */ private final ErrorReporter errorReporter; + /** Error types that, when returned by the sink, should fail the job. */ private final Set errorTypesForFailing; + /** Buffer of serialized messages awaiting the next flush to BigQuery. */ private final List messages = new ArrayList<>(); + /** Number of messages currently buffered in {@link #messages}. */ private int currentBatchSize; + /** + * Instantiates a new BigQuery sink writer. 
+ * + * @param protoSerializer the serializer converting rows into protobuf key and value bytes + * @param bigquerySink the depot sink that performs the writes + * @param batchSize the maximum number of messages buffered before flushing + * @param errorReporter the reporter for fatal and non-fatal sink errors + * @param errorTypesForFailing the set of error types that should cause the job to fail + */ public BigQuerySinkWriter(ProtoSerializer protoSerializer, Sink bigquerySink, int batchSize, ErrorReporter errorReporter, Set errorTypesForFailing) { this.protoSerializer = protoSerializer; this.bigquerySink = bigquerySink; @@ -40,6 +65,16 @@ public BigQuerySinkWriter(ProtoSerializer protoSerializer, Sink bigquerySink, in this.errorTypesForFailing = errorTypesForFailing; } + /** + * {@inheritDoc} + * + *

Serializes the given row and adds it to the buffer; once the buffer reaches the configured + * batch size the batch is flushed to BigQuery and the buffer is reset. + * + * @param element the output row to write + * @param context the sink write context + * @throws IOException if serialization or the flush to BigQuery fails + */ @Override public void write(Row element, Context context) throws IOException { log.info("adding row to BQ batch : " + element); @@ -57,6 +92,12 @@ public void write(Row element, Context context) throws IOException { } } + /** + * Pushes the currently buffered messages to the depot BigQuery sink and handles any reported errors. + * + * @throws SinkException if the underlying depot sink fails while pushing the batch + * @throws BigQueryWriterException if the response contains errors whose type is configured as fatal + */ private void pushToBq() throws SinkException, BigQueryWriterException { log.info("Pushing " + currentBatchSize + " records to bq"); SinkResponse sinkResponse; @@ -72,6 +113,13 @@ private void pushToBq() throws SinkException, BigQueryWriterException { } } + /** + * Partitions the errors in the sink response into fatal and non-fatal groups, reports each group to + * the error reporter accordingly, and fails if any fatal error is present. + * + * @param sinkResponse the response returned by the depot sink for the last batch + * @throws BigQueryWriterException if at least one error has a type contained in the fatal error set + */ protected void checkAndThrow(SinkResponse sinkResponse) throws BigQueryWriterException { Map> failedErrorTypes = sinkResponse.getErrors().values().stream().collect( Collectors.partitioningBy(errorInfo -> errorTypesForFailing.contains(errorInfo.getErrorType()))); @@ -86,6 +134,13 @@ protected void checkAndThrow(SinkResponse sinkResponse) throws BigQueryWriterExc } } + /** + * Logs detailed information for every error in the sink response, correlating each error with the + * message that produced it. 
+ * + * @param sinkResponse the response containing the per-message errors + * @param sentMessages the messages sent in the failed batch, indexed by their position + */ protected void logErrors(SinkResponse sinkResponse, List sentMessages) { log.error("Failed to push " + sinkResponse.getErrors().size() + " records to BigQuerySink"); sinkResponse.getErrors().forEach((index, errorInfo) -> { @@ -112,11 +167,26 @@ public List prepareCommit(boolean flush) throws IOException { return Collections.emptyList(); } + /** + * {@inheritDoc} + * + *

Closes the underlying depot BigQuery sink and releases its resources. + * + * @throws Exception if the underlying sink fails to close + */ @Override public void close() throws Exception { bigquerySink.close(); } + /** + * {@inheritDoc} + * + *

This writer holds no checkpointable state, so an empty list is returned. + * + * @param checkpointId the id of the checkpoint being taken + * @return an empty list + */ @Override public List snapshotState(long checkpointId) { // We don't snapshot anything diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/ErrorHandler.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/ErrorHandler.java index 4f00d1964..409b21d32 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/ErrorHandler.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/ErrorHandler.java @@ -19,8 +19,15 @@ * The Error handler for Influx sink. */ public class ErrorHandler implements Serializable { + /** + * Handler invoked for a batch of points that failed to write; it selects the matching + * {@link InfluxError} strategy and delegates handling to it. + */ private BiConsumer, Throwable> exceptionHandler; + /** + * The {@link InfluxError} strategy matched for the most recent failure, if any. + */ private InfluxError error; /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxDBSink.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxDBSink.java index 645992a5c..30b62a8e1 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxDBSink.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxDBSink.java @@ -19,6 +19,17 @@ import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +/** + * Flink {@link Sink} implementation that writes Dagger output {@link Row} records to InfluxDB. + * + *

This is Dagger's default output sink. When a writer is created it opens a batched + * {@link InfluxDB} connection (using the configured URL and credentials), wires in an + * {@link ErrorHandler} to classify the connection's asynchronous batch-write failures, and + * delegates per-row point construction and writing to an {@link InfluxDBWriter}. Optional + * {@link InfluxSinkOverrides} let a job override the measurement name and retention policy per sink. + * Like the other Dagger sinks it is stateless, exposing no committer, global committer or state + * serializer (all return {@link Optional#empty()}). + */ public class InfluxDBSink implements Sink { private InfluxDBFactoryWrapper influxDBFactory; private Configuration configuration; @@ -27,6 +38,19 @@ public class InfluxDBSink implements Sink { private ErrorReporter errorReporter; private final InfluxSinkOverrides overrides; + /** + * Creates an InfluxDB sink. + * + * @param influxDBFactory the factory used to open the {@link InfluxDB} connection when a writer + * is created + * @param configuration the job configuration supplying the Influx URL, credentials, database + * name, batching and measurement settings + * @param columnNames the output column names used to map row fields to Influx tags/fields + * when row field names are not used + * @param errorHandler the handler that captures and classifies asynchronous batch-write errors + * @param overrides optional measurement-name/retention-policy overrides; when {@code null} + * it falls back to {@link InfluxSinkOverrides#none()} + */ public InfluxDBSink(InfluxDBFactoryWrapper influxDBFactory, Configuration configuration, String[] columnNames, ErrorHandler errorHandler, InfluxSinkOverrides overrides) { this.influxDBFactory = influxDBFactory; @@ -36,6 +60,20 @@ public InfluxDBSink(InfluxDBFactoryWrapper influxDBFactory, Configuration config this.overrides = overrides == null ? InfluxSinkOverrides.none() : overrides; } + /** + * {@inheritDoc} + * + *

Opens an {@link InfluxDB} connection from the configured URL, username and password, + * initializes the {@link ErrorHandler} with the sink context, and enables client-side batching + * (batch size and flush interval from configuration) so failures are dispatched to the handler's + * exception callback. An {@link ErrorReporter} is created lazily from the metric group, and the + * method returns an {@link InfluxDBWriter} bound to these collaborators and any overrides. + * + * @param context the Flink sink init context, used to initialize the error handler and metrics + * @param states the previously checkpointed writer states; ignored because this sink keeps no state + * @return a new {@link InfluxDBWriter} + * @throws IOException if the writer cannot be created + */ @Override public SinkWriter createWriter(InitContext context, List states) throws IOException { InfluxDB influxDB = influxDBFactory.connect(configuration.getString(Constants.SINK_INFLUX_URL_KEY, Constants.SINK_INFLUX_URL_DEFAULT), @@ -53,26 +91,52 @@ public SinkWriter createWriter(InitContext context, List return influxDBWriter; } + /** + * {@inheritDoc} + * + *

Returns {@link Optional#empty()} because this sink keeps no writer state to checkpoint. + */ @Override public Optional> getWriterStateSerializer() { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

Returns {@link Optional#empty()}; durability is handled by the batched InfluxDB client, so + * no Flink committer is required. + */ @Override public Optional> createCommitter() throws IOException { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

Returns {@link Optional#empty()}; this sink has no global commit phase. + */ @Override public Optional> createGlobalCommitter() throws IOException { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

Returns {@link Optional#empty()} because there are no committables to serialize. + */ @Override public Optional> getCommittableSerializer() { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

Returns {@link Optional#empty()} because there are no global committables to serialize. + */ @Override public Optional> getGlobalCommittableSerializer() { return Optional.empty(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxDBWriter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxDBWriter.java index f130e71c2..60a068cef 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxDBWriter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxDBWriter.java @@ -25,18 +25,51 @@ import java.util.Set; import java.util.concurrent.TimeUnit; +/** + * Flink {@link SinkWriter} that converts each {@link Row} into an InfluxDB {@link Point} and writes + * it to the configured database through a batched {@link InfluxDB} client. + * + *

Columns (or, when {@code SINK_INFLUX_USING_ROW_FIELD_NAMES} is enabled, the row's own field + * names) are mapped onto a point as follows: a {@code window_timestamp} column becomes the point's + * timestamp (interpreted as UTC), columns prefixed with {@code tag_} become Influx tags, columns + * prefixed with {@code label_} become tags with the prefix stripped, and any other non-null column + * becomes a field. Because the underlying client batches and flushes asynchronously, write failures + * are surfaced through the shared {@link ErrorHandler} and re-checked on every write and at + * checkpoint time; fatal errors are reported via the {@link ErrorReporter} and rethrown to fail the + * job. The measurement name and retention policy come from {@link InfluxSinkOverrides} when set, + * otherwise from configuration. + */ public class InfluxDBWriter implements SinkWriter { private static final Logger LOGGER = LoggerFactory.getLogger(InfluxDBWriter.class.getName()); + /** Target InfluxDB database, from {@code SINK_INFLUX_DB_NAME}. */ private final String databaseName; + /** Retention policy applied to writes; taken from overrides when present, otherwise configuration. */ private final String retentionPolicy; + /** Influx measurement name; taken from overrides when present, otherwise configuration. */ private final String measurementName; private InfluxDB influxDB; private String[] columnNames; private ErrorHandler errorHandler; private ErrorReporter errorReporter; + /** When {@code true}, points are built from the row's field names instead of the configured column names. */ private boolean useRowFieldNames; + /** + * Creates an InfluxDB writer, resolving its target database, retention policy and measurement name. + * + *

The database name and the {@code useRowFieldNames} flag are taken from configuration. The + * retention policy and measurement name are taken from {@code overrides} when present, otherwise + * from configuration. + * + * @param configuration the job configuration supplying database name, retention policy, + * measurement name and the row-field-name flag defaults + * @param influxDB the batched InfluxDB client used to write points + * @param columnNames the output column names used to build points when row field names are not used + * @param errorHandler the handler that captures asynchronous batch-write failures + * @param errorReporter the reporter used to surface fatal write failures as metrics + * @param overrides optional measurement-name/retention-policy overrides + */ public InfluxDBWriter(Configuration configuration, InfluxDB influxDB, String[] columnNames, ErrorHandler errorHandler, ErrorReporter errorReporter, InfluxSinkOverrides overrides) { databaseName = configuration.getString(Constants.SINK_INFLUX_DB_NAME_KEY, Constants.SINK_INFLUX_DB_NAME_DEFAULT); @@ -53,6 +86,19 @@ public InfluxDBWriter(Configuration configuration, InfluxDB influxDB, String[] c this.errorReporter = errorReporter; } + /** + * {@inheritDoc} + * + *

Builds an Influx {@link Point} from the row (using the row's field names or the configured + * column names depending on {@code useRowFieldNames}), checks for any pending asynchronous batch + * error and rethrows it, then writes the point to the configured database and retention policy. A + * synchronous write failure is reported as fatal and rethrown. + * + * @param row the output row to write + * @param context the Flink writer context (unused) + * @throws IOException if a pending asynchronous batch error is detected or the write fails + * @throws InterruptedException if interrupted while writing (declared by the {@link SinkWriter} contract) + */ @Override public void write(Row row, Context context) throws IOException, InterruptedException { LOGGER.info("row to influx: " + row); @@ -75,6 +121,18 @@ public void write(Row row, Context context) throws IOException, InterruptedExcep } } + /** + * Builds an Influx {@link Point} builder from the row using the configured column names. + * + *

For each column: {@code window_timestamp} sets the point time (interpreted as UTC), columns + * prefixed with {@code tag_} are added as tags, columns prefixed with {@code label_} are added as + * tags with the prefix removed, and any remaining column whose value is non-null and non-empty is + * collected into {@code fields}. + * + * @param row the row whose field values are read positionally by column index + * @param fields the mutable map that collects the point's fields, populated as a side effect + * @return the point builder with measurement, time and tags applied + */ private Builder writeUsingColumnNames(Row row, Map fields) { Builder pointBuilder = Point.measurement(measurementName); @@ -97,6 +155,19 @@ private Builder writeUsingColumnNames(Row row, Map fields) { return pointBuilder; } + /** + * Builds an Influx {@link Point} builder from the row using the row's own field names. + * + *

Behaves like {@link #writeUsingColumnNames(Row, Map)} but iterates the row's declared field + * names: a {@code window_timestamp} field sets the point time, {@code tag_} and {@code label_} + * prefixes map to tags (the latter with the prefix stripped), and the remaining non-null fields + * become point fields. + * + * @param row the row whose fields are read by name + * @param fields the mutable map that collects the point's fields, populated as a side effect + * @return the point builder with measurement, time and tags applied + * @throws NullPointerException if the row reports no field names + */ private Builder writeUsingRowFieldNames(Row row, Map fields) { Builder pointBuilder = Point.measurement(measurementName); @@ -121,17 +192,44 @@ private Builder writeUsingRowFieldNames(Row row, Map fields) { return pointBuilder; } + /** + * {@inheritDoc} + * + *

This writer produces no committables and relies on the InfluxDB client's own batching, so it + * performs no work here and returns {@code null}. + * + * @param flush whether Flink is requesting a flush of un-staged data (ignored) + * @return {@code null}, as there is nothing to commit + * @throws IOException declared by the {@link SinkWriter} contract; not thrown here + * @throws InterruptedException declared by the {@link SinkWriter} contract; not thrown here + */ @Override public List prepareCommit(boolean flush) throws IOException, InterruptedException { return null; } + /** + * {@inheritDoc} + * + *

Closes the underlying {@link InfluxDB} client, flushing and releasing its resources. + * + * @throws Exception if the client fails to close cleanly + */ @Override public void close() throws Exception { influxDB.close(); } + /** + * Surfaces any asynchronous batch-write failure captured by the {@link ErrorHandler}. + * + *

If the handler currently holds an error that carries an exception, that exception is reported + * as fatal and rethrown, so the failure propagates to Flink instead of being silently swallowed by + * the asynchronous batch writer. + * + * @throws IOException the captured asynchronous write exception, when one is present + */ private void addErrorMetricsAndThrow() throws IOException { if (errorHandler.getError().isPresent() && errorHandler.getError().get().hasException()) { IOException currentException = errorHandler.getError().get().getCurrentException(); @@ -140,6 +238,18 @@ private void addErrorMetricsAndThrow() throws IOException { } } + /** + * {@inheritDoc} + * + *

Forces a synchronous flush of the InfluxDB client's batch so buffered points are durably + * written before the checkpoint completes. Pending asynchronous errors are re-checked both before + * and after the flush and rethrown if present; a flush failure is reported as fatal and rethrown. + * No writer state is produced, so an empty list is returned. + * + * @param checkpointId the id of the checkpoint being taken (unused) + * @return an empty list of writer state + * @throws IOException if a pending asynchronous error is detected or the flush fails + */ @Override public List snapshotState(long checkpointId) throws IOException { addErrorMetricsAndThrow(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxSinkOverrides.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxSinkOverrides.java index 7cfa6bb52..35c09751f 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxSinkOverrides.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/InfluxSinkOverrides.java @@ -46,18 +46,42 @@ public static InfluxSinkOverrides none() { return NONE; } + /** + * Creates overrides that set only the measurement name. + * + * @param measurementName the measurement name to use; {@code null}/empty means fall back to configuration + * @return overrides carrying the given measurement name and no retention-policy override + */ public static InfluxSinkOverrides withMeasurementName(String measurementName) { return new InfluxSinkOverrides(measurementName, null); } + /** + * Creates overrides that set only the retention policy.
+ * + * @param retentionPolicy the retention policy to use; {@code null}/empty means fall back to configuration + * @return overrides carrying the given retention policy and no measurement-name override + */ public static InfluxSinkOverrides withRetentionPolicy(String retentionPolicy) { return new InfluxSinkOverrides(null, retentionPolicy); } + /** + * Creates overrides that set both the measurement name and the retention policy. + * + * @param measurementName the measurement name to use; {@code null}/empty means fall back to configuration + * @param retentionPolicy the retention policy to use; {@code null}/empty means fall back to configuration + * @return overrides carrying both values + */ public static InfluxSinkOverrides of(String measurementName, String retentionPolicy) { return new InfluxSinkOverrides(measurementName, retentionPolicy); } + /** + * Returns a fluent {@link Builder} for assembling overrides. + * + * @return a new builder + */ public static Builder builder() { return new Builder(); } @@ -72,14 +96,29 @@ public String getRetentionPolicy() { return retentionPolicy; } + /** + * Indicates whether a usable measurement-name override is present. + * + * @return {@code true} if the measurement name is non-null and non-empty + */ public boolean hasMeasurementName() { return !Strings.isNullOrEmpty(measurementName); } + /** + * Indicates whether a usable retention-policy override is present. + * + * @return {@code true} if the retention policy is non-null and non-empty + */ public boolean hasRetentionPolicy() { return !Strings.isNullOrEmpty(retentionPolicy); } + /** + * {@inheritDoc} + * + *

Two instances are equal when both the measurement name and the retention policy are equal. + */ @Override public boolean equals(Object o) { if (this == o) { @@ -93,11 +132,21 @@ public boolean equals(Object o) { && Objects.equals(retentionPolicy, that.retentionPolicy); } + /** + * {@inheritDoc} + * + *

Derived from the measurement name and the retention policy. + */ @Override public int hashCode() { return Objects.hash(measurementName, retentionPolicy); } + /** + * {@inheritDoc} + * + *

Renders the measurement name and retention policy for debugging and logging. + */ @Override public String toString() { return "InfluxSinkOverrides{measurementName='" + measurementName @@ -109,19 +158,40 @@ public static final class Builder { private String measurementName; private String retentionPolicy; + /** + * Creates an empty builder; use {@link InfluxSinkOverrides#builder()} to obtain instances. + */ private Builder() { } + /** + * Sets the measurement-name override. + * + * @param name the measurement name; {@code null}/empty means fall back to configuration + * @return this builder, for chaining + */ public Builder measurementName(String name) { this.measurementName = name; return this; } + /** + * Sets the retention-policy override. + * + * @param policy the retention policy; {@code null}/empty means fall back to configuration + * @return this builder, for chaining + */ public Builder retentionPolicy(String policy) { this.retentionPolicy = policy; return this; } + /** + * Builds the overrides, returning the shared {@link InfluxSinkOverrides#none()} instance when + * neither value was set. + * + * @return the assembled overrides + */ public InfluxSinkOverrides build() { if (measurementName == null && retentionPolicy == null) { return NONE; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/LateRecordDropError.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/LateRecordDropError.java index 86adf8b1f..c207ea80e 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/LateRecordDropError.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/LateRecordDropError.java @@ -17,9 +17,13 @@ * The Late record drop error. */ public class LateRecordDropError implements InfluxError { + /** Flink counter tracking the number of late records dropped by InfluxDB.
*/ private final Counter counter; + /** Logger used to record dropped points and counts for this error strategy. */ private static final Logger LOGGER = LoggerFactory.getLogger(LateRecordDropError.class.getName()); + /** Reporter used to publish the dropped-record error as a non-fatal exception. */ private ErrorReporter errorStatsReporter; + /** Message prefix that identifies an InfluxDB retention-policy late-drop error. */ private static final String PREFIX = "{\"error\":\"partial write: points beyond retention policy dropped="; /** @@ -34,21 +38,53 @@ public LateRecordDropError(InitContext initContext) { Constants.METRIC_TELEMETRY_SHUTDOWN_PERIOD_MS_DEFAULT); } + /** + * {@inheritDoc} + * + *

A dropped late record is not treated as an exception, so this always returns {@code false}. + * + * @return {@code false} always + */ @Override public boolean hasException() { return false; } + /** + * {@inheritDoc} + * + *

No exception is associated with dropped late records, so this always returns {@code null}. + * + * @return {@code null} always + */ @Override public IOException getCurrentException() { return null; } + /** + * {@inheritDoc} + * + *

Matches when the throwable is an {@code InfluxDBException} reporting that points beyond the + * retention policy were dropped. + * + * @param throwable the throwable raised while writing to InfluxDB + * @return {@code true} if the throwable represents a late-record drop, {@code false} otherwise + */ @Override public boolean filterError(Throwable throwable) { return isLateDropping(throwable); } + /** + * {@inheritDoc} + * + *

Increments the dropped-records counter by the parsed count, reports the throwable as a + * non-fatal exception, and logs the failed points. + * + * @param points the points that failed to be written + * @param throwable the late-record drop error describing how many points were dropped + */ @Override public void handle(Iterable points, Throwable throwable) { reportDroppedPoints(parseDroppedPointsCount(throwable)); @@ -56,16 +92,33 @@ public void handle(Iterable points, Throwable throwable) { logFailedPoints(points, LOGGER); } + /** + * Increments the dropped-records counter and logs the number of dropped points. + * + * @param numPoints the number of points that were dropped + */ private void reportDroppedPoints(int numPoints) { counter.inc(numPoints); LOGGER.warn("Numbers of Points Dropped :" + numPoints); } + /** + * Extracts the number of dropped points encoded in the InfluxDB error message. + * + * @param throwable the throwable whose message encodes the dropped-point count after an {@code =} + * @return the parsed number of dropped points + */ private int parseDroppedPointsCount(Throwable throwable) { String[] split = throwable.getMessage().split("="); return Integer.parseInt(split[1].trim().replace("\"}", "")); } + /** + * Determines whether the given throwable is an InfluxDB retention-policy late-drop error. 
+ * + * @param throwable the throwable to inspect + * @return {@code true} if it is an {@code InfluxDBException} whose message starts with the drop prefix + */ private boolean isLateDropping(Throwable throwable) { return throwable instanceof InfluxDBException && throwable.getMessage().startsWith(PREFIX); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/NoError.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/NoError.java index 875fc9832..05a91ca57 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/NoError.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/NoError.java @@ -8,21 +8,51 @@ * No error found on Influx sink. */ public class NoError implements InfluxError { + /** + * {@inheritDoc} + * + *

This no-op strategy never carries an exception, so this always returns {@code false}. + * + * @return {@code false} always + */ @Override public boolean hasException() { return false; } + /** + * {@inheritDoc} + * + *

There is never an associated exception, so this always returns {@code null}. + * + * @return {@code null} always + */ @Override public IOException getCurrentException() { return null; } + /** + * {@inheritDoc} + * + *

This fallback strategy matches nothing, so it always returns {@code false}. + * + * @param throwable the throwable raised while writing to InfluxDB + * @return {@code false} always + */ @Override public boolean filterError(Throwable throwable) { return false; } + /** + * {@inheritDoc} + * + *

No action is taken because there is no error to handle. + * + * @param points the points associated with the (absent) error + * @param throwable the throwable, which is ignored + */ @Override public void handle(Iterable points, Throwable throwable) { } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/ValidError.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/ValidError.java index 05f5b8e77..0e913f329 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/ValidError.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/ValidError.java @@ -13,24 +13,56 @@ */ public class ValidError implements InfluxError { + /** Logger for failed points; created under the {@link InfluxDBSink} logger name. */ private static final Logger LOGGER = LoggerFactory.getLogger(InfluxDBSink.class.getName()); + /** The wrapped write error captured for the most recently handled failure. */ private IOException error; + /** + * {@inheritDoc} + * + *

This strategy always represents an error condition, so it returns {@code true}. + * + * @return {@code true} always + */ @Override public boolean hasException() { return true; } + /** + * {@inheritDoc} + * + *

Returns the error captured by the most recent call to {@link #handle}. + * + * @return the captured {@link IOException}, or {@code null} if no error has been handled yet + */ @Override public IOException getCurrentException() { return error; } + /** + * {@inheritDoc} + * + *

Matches when the throwable is a JVM {@link Error}. + * + * @param throwable the throwable raised while writing to InfluxDB + * @return {@code true} if the throwable is an {@link Error}, {@code false} otherwise + */ @Override public boolean filterError(Throwable throwable) { return throwable instanceof Error; } + /** + * {@inheritDoc} + * + *

Wraps the throwable in an {@code InfluxWriteException} and logs the failed points. + * + * @param points the points that failed to be written + * @param throwable the error that occurred while writing + */ @Override public void handle(Iterable points, Throwable throwable) { error = new InfluxWriteException(throwable); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/ValidException.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/ValidException.java index 3c8bed822..3d4ebe937 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/ValidException.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/influx/errors/ValidException.java @@ -11,24 +11,56 @@ * The Valid exception. */ public class ValidException implements InfluxError { + /** Logger used to record the points that failed to be written. */ private static final Logger LOGGER = LoggerFactory.getLogger(ValidException.class.getName()); + /** The wrapped write exception captured for the most recently handled failure. */ private IOException exception; + /** + * {@inheritDoc} + * + *

This strategy always represents an error condition, so it returns {@code true}. + * + * @return {@code true} always + */ @Override public boolean hasException() { return true; } + /** + * {@inheritDoc} + * + *

Returns the exception captured by the most recent call to {@link #handle}. + * + * @return the captured {@link IOException}, or {@code null} if no error has been handled yet + */ @Override public IOException getCurrentException() { return exception; } + /** + * {@inheritDoc} + * + *

Matches when the throwable is an {@link Exception}. + * + * @param throwable the throwable raised while writing to InfluxDB + * @return {@code true} if the throwable is an {@link Exception}, {@code false} otherwise + */ @Override public boolean filterError(Throwable throwable) { return throwable instanceof Exception; } + /** + * {@inheritDoc} + * + *

Wraps the throwable in an {@code InfluxWriteException} and logs the failed points. + * + * @param points the points that failed to be written + * @param throwable the exception that occurred while writing + */ @Override public void handle(Iterable points, Throwable throwable) { exception = new InfluxWriteException(throwable); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/KafkaSerializationSchemaFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/KafkaSerializationSchemaFactory.java index 6462800d0..f8a0de840 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/KafkaSerializationSchemaFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/KafkaSerializationSchemaFactory.java @@ -7,7 +7,27 @@ import com.gotocompany.dagger.core.sink.kafka.builder.KafkaProtoSerializerBuilder; import com.gotocompany.dagger.core.utils.Constants; +/** + * Factory that selects the appropriate {@link KafkaSerializerBuilder} for the configured Kafka sink + * output encoding. + * + *

It reads {@code SINK_KAFKA_DATA_TYPE} (defaulting to {@code PROTO}) and returns a + * {@link KafkaJsonSerializerBuilder} for {@code JSON} output or a {@link KafkaProtoSerializerBuilder} + * otherwise, so callers such as the {@code SinkOrchestrator} can obtain a serializer without knowing + * the concrete encoding. + */ public class KafkaSerializationSchemaFactory { + /** + * Returns the serializer builder matching the configured Kafka sink output data type. + * + * @param configuration the job configuration; its {@code SINK_KAFKA_DATA_TYPE} selects + * the encoding (defaults to {@code PROTO}) + * @param stencilClientOrchestrator the Stencil client orchestrator used by the protobuf builder to + * resolve descriptors (unused for JSON output) + * @param columnNames the output column names mapped onto the serialized record + * @return a {@link KafkaJsonSerializerBuilder} for {@code JSON}, otherwise a + * {@link KafkaProtoSerializerBuilder} + */ public static KafkaSerializerBuilder getSerializationSchema(Configuration configuration, StencilClientOrchestrator stencilClientOrchestrator, String[] columnNames) { DataTypes dataTypes = DataTypes.valueOf(configuration.getString(Constants.SINK_KAFKA_DATA_TYPE, "PROTO")); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/KafkaSerializerBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/KafkaSerializerBuilder.java index eab94a916..249838be4 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/KafkaSerializerBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/KafkaSerializerBuilder.java @@ -2,6 +2,21 @@ import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema; +/** + * Builds the Flink {@link KafkaRecordSerializationSchema} that serializes output rows before they are + * produced to the Kafka sink topic. + * + *

Each implementation encapsulates a particular output encoding: {@code KafkaProtoSerializerBuilder} + * for protobuf and {@code KafkaJsonSerializerBuilder} for JSON. The concrete builder is chosen by + * {@code KafkaSerializationSchemaFactory} from the configured sink data type, and implementations also + * publish output topic/proto/stream telemetry while building. + */ public interface KafkaSerializerBuilder { + /** + * Builds the Kafka record serialization schema for the configured output topic and encoding. + * + * @return the {@link KafkaRecordSerializationSchema} that maps each output row to a Kafka + * producer record + */ KafkaRecordSerializationSchema build(); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/builder/KafkaJsonSerializerBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/builder/KafkaJsonSerializerBuilder.java index 73e5399cd..51cd320b8 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/builder/KafkaJsonSerializerBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/builder/KafkaJsonSerializerBuilder.java @@ -18,20 +18,52 @@ import java.util.List; import java.util.Map; +/** + * {@link KafkaSerializerBuilder} that produces a JSON {@link KafkaRecordSerializationSchema} for the + * Kafka sink. + * + *

On {@link #build()} it reads the output topic, stream and JSON schema from configuration, records + * the topic and stream as telemetry, converts the JSON schema into a Flink {@code TypeInformation} + * and builds a {@link JsonRowSerializationSchema} wrapped in a record serialization schema targeting + * the output topic. As a {@link TelemetryPublisher} it exposes the collected metrics through + * {@link #getTelemetry()}. + */ public class KafkaJsonSerializerBuilder implements KafkaSerializerBuilder, TelemetryPublisher { + /** Collected telemetry (output topic and stream) keyed by telemetry type. */ private Map> metrics; private Configuration configuration; + /** + * Creates a JSON serializer builder. + * + * @param configuration the job configuration providing the output topic, stream and JSON schema + */ public KafkaJsonSerializerBuilder(Configuration configuration) { this.configuration = configuration; this.metrics = new HashMap<>(); } + /** + * {@inheritDoc} + * + *

Returns the output topic and stream recorded during {@link #build()}. + */ @Override public Map> getTelemetry() { return metrics; } + /** + * {@inheritDoc} + * + *

Reads the output topic, stream and JSON schema from configuration, records the topic and + * stream as telemetry and notifies subscribers, converts the JSON schema into a Flink row type, and + * returns a {@link KafkaRecordSerializationSchema} that serializes each row as JSON to the output + * topic. + * + * @return a JSON-based {@link KafkaRecordSerializationSchema} for the output topic + * @throws InvalidJSONSchemaException if the configured JSON schema is invalid and cannot be converted + */ @Override public KafkaRecordSerializationSchema build() { String outputTopic = configuration.getString(Constants.SINK_KAFKA_TOPIC_KEY, ""); @@ -57,6 +89,12 @@ public KafkaRecordSerializationSchema build() { } } + /** + * Appends a telemetry value under the given key, creating the backing list on first use. + * + * @param key the telemetry key + * @param value the telemetry value to record + */ private void addMetric(String key, String value) { metrics.computeIfAbsent(key, k -> new ArrayList<>()).add(value); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/builder/KafkaProtoSerializerBuilder.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/builder/KafkaProtoSerializerBuilder.java index b97cf7e71..91e9ac71a 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/builder/KafkaProtoSerializerBuilder.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/kafka/builder/KafkaProtoSerializerBuilder.java @@ -16,12 +16,30 @@ import java.util.List; import java.util.Map; +/** + * {@link KafkaSerializerBuilder} that produces a protobuf {@link KafkaRecordSerializationSchema} for + * the Kafka sink. + * + *

On {@link #build()} it reads the output topic, proto key/message classes and stream from + * configuration, records them as telemetry, and wraps a {@link ProtoSerializer} (which resolves + * protobuf descriptors through the {@link StencilClientOrchestrator}) in a {@link KafkaProtoSerializer} + * bound to the output topic. As a {@link TelemetryPublisher} it exposes the collected output + * topic/proto/stream metrics through {@link #getTelemetry()}. + */ public class KafkaProtoSerializerBuilder implements KafkaSerializerBuilder, TelemetryPublisher { + /** Collected telemetry (output topic, proto message and stream) keyed by telemetry type. */ private Map> metrics; private Configuration configuration; private StencilClientOrchestrator stencilClientOrchestrator; private String[] columnNames; + /** + * Creates a protobuf serializer builder. + * + * @param configuration the job configuration providing the output topic and proto settings + * @param stencilClientOrchestrator the Stencil client orchestrator used to resolve protobuf descriptors + * @param columnNames the output column names mapped onto the protobuf message fields + */ public KafkaProtoSerializerBuilder(Configuration configuration, StencilClientOrchestrator stencilClientOrchestrator, String[] columnNames) { this.configuration = configuration; this.stencilClientOrchestrator = stencilClientOrchestrator; @@ -29,6 +47,15 @@ public KafkaProtoSerializerBuilder(Configuration configuration, StencilClientOrc this.metrics = new HashMap<>(); } + /** + * {@inheritDoc} + * + *

<p>Reads the output topic, proto key/message classes and stream from configuration, records them + * as telemetry and notifies subscribers, then returns a {@link KafkaProtoSerializer} that encodes + * each row through a {@link ProtoSerializer} and targets the configured output topic. + * + * @return a protobuf-based {@link KafkaRecordSerializationSchema} for the output topic + */ @Override public KafkaRecordSerializationSchema build() { String outputTopic = configuration.getString(Constants.SINK_KAFKA_TOPIC_KEY, ""); @@ -44,11 +71,22 @@ public KafkaRecordSerializationSchema build() { return new KafkaProtoSerializer(protoSerializer, outputTopic); } + /** + * {@inheritDoc} + * + *

Returns the output topic, proto message and stream recorded during {@link #build()}. + */ @Override public Map> getTelemetry() { return metrics; } + /** + * Appends a telemetry value under the given key, creating the backing list on first use. + * + * @param key the telemetry key + * @param value the telemetry value to record + */ private void addMetric(String key, String value) { metrics.computeIfAbsent(key, k -> new ArrayList<>()).add(value); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/log/LogSink.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/log/LogSink.java index 60a5c5537..22b922946 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/log/LogSink.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/log/LogSink.java @@ -14,6 +14,7 @@ * The Log sink. */ public class LogSink implements Sink { + /** The output column names used to label each row value when it is logged. */ private final String[] columnNames; /** @@ -25,31 +26,75 @@ public LogSink(String[] columnNames) { this.columnNames = columnNames; } + /** + * {@inheritDoc} + * + *

<p>Creates a {@link LogSinkWriter} that logs each row using the configured column names. + * + * @param context the sink initialization context + * @param states the restored writer states; unused because this sink keeps no state + * @return a new {@link LogSinkWriter} + */ @Override public SinkWriter createWriter(InitContext context, List states) { return new LogSinkWriter(columnNames); } + /** + * {@inheritDoc} + * + *

<p>This sink keeps no writer state, so no serializer is provided. + * + * @return an empty {@link Optional} + */ @Override public Optional> getWriterStateSerializer() { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

<p>The log sink writes directly and uses no committer. + * + * @return an empty {@link Optional} + */ @Override public Optional> createCommitter() { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

<p>The log sink requires no global commit phase. + * + * @return an empty {@link Optional} + */ @Override public Optional> createGlobalCommitter() { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

<p>No committables are produced, so no serializer is provided. + * + * @return an empty {@link Optional} + */ @Override public Optional> getCommittableSerializer() { return Optional.empty(); } + /** + * {@inheritDoc} + * + *

No global committables are produced, so no serializer is provided. + * + * @return an empty {@link Optional} + */ @Override public Optional> getGlobalCommittableSerializer() { return Optional.empty(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/log/LogSinkWriter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/log/LogSinkWriter.java index c643e4dcc..de3ef6de5 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/log/LogSinkWriter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/sink/log/LogSinkWriter.java @@ -10,14 +10,35 @@ import java.util.List; import java.util.Map; +/** + * Flink {@link SinkWriter} that logs each output {@link Row} as a column-name-to-value map at info + * level. + * + *

<p>Created by {@link LogSink}, it is intended for local development and debugging rather than for + * writing to an external system. It buffers nothing, produces no committables and holds no state. + */ public class LogSinkWriter implements SinkWriter { private static final Logger LOGGER = LoggerFactory.getLogger(LogSinkWriter.class.getName()); private final String[] columnNames; + /** + * Creates a log sink writer. + * + * @param columnNames the output column names used to label each row field in the logged map + */ public LogSinkWriter(String[] columnNames) { this.columnNames = columnNames; } + /** + * {@inheritDoc} + * + *

<p>Builds a map from each non-null row field to its string value, keyed by the corresponding + * column name, and logs it at info level. + * + * @param row the output row to log + * @param context the Flink writer context (unused) + */ @Override public void write(Row row, Context context) { Map map = new HashMap<>(); @@ -30,11 +51,26 @@ public void write(Row row, Context context) { LOGGER.info(map.toString()); } + /** + * {@inheritDoc} + * + *

<p>This writer produces no committables, so it does no work and returns {@code null}. + * + * @param flush whether Flink is requesting a flush of un-staged data (ignored) + * @return {@code null}, as there is nothing to commit + */ @Override public List prepareCommit(boolean flush) { return null; } + /** + * {@inheritDoc} + * + *

No resources are held, so this is a no-op. + * + * @throws Exception declared by the {@link SinkWriter} contract; not thrown here + */ @Override public void close() throws Exception { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/DaggerSource.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/DaggerSource.java index 13980f761..6a9d4d563 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/DaggerSource.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/DaggerSource.java @@ -4,8 +4,40 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +/** + * Abstraction over a concrete data source that can feed records into a Dagger Flink job. + * + *

<p>Each implementation wraps a specific Flink source connector — for example the modern + * {@code KafkaSource}, the legacy {@code FlinkKafkaConsumer}, or a bounded Parquet + * {@code FileSource} — and knows both how to register itself onto a Flink + * {@link StreamExecutionEnvironment} and whether it is applicable to the current stream + * configuration. {@link DaggerSourceFactory} iterates over the available implementations and + * picks the first one whose {@link #canBuild()} returns {@code true} for the configured + * {@code SOURCE_DETAILS}. + * + * @param <T> the type of records produced by the source; within Dagger this is always + * {@code Row} + */ public interface DaggerSource { + /** + * Builds the underlying Flink source and attaches it to the given execution environment. + * + * @param executionEnvironment the Flink {@link StreamExecutionEnvironment} the source is added to + * @param watermarkStrategy the watermark strategy used to assign event-time timestamps and + * emit watermarks for the produced records + * @return the {@code DataStream} of records emitted by this source + */ DataStream register(StreamExecutionEnvironment executionEnvironment, WatermarkStrategy watermarkStrategy); + /** + * Indicates whether this source can be constructed for the current stream configuration. + * + *

Implementations typically inspect the configured {@code SOURCE_DETAILS} (source name and + * source type) together with the supplied deserializer to decide applicability. It is used by + * {@link DaggerSourceFactory} to select exactly one source per stream. + * + * @return {@code true} if this source matches the configuration and can be built; + * {@code false} otherwise + */ boolean canBuild(); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/DaggerSourceFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/DaggerSourceFactory.java index c940b4e71..fcee9432d 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/DaggerSourceFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/DaggerSourceFactory.java @@ -16,8 +16,32 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +/** + * Factory that selects and instantiates the {@link DaggerSource} implementation appropriate for a + * given stream. + * + *

Dagger supports multiple source connectors (Kafka via the modern {@code KafkaSource}, the + * legacy {@code FlinkKafkaConsumer}, and a bounded Parquet {@code FileSource}). This factory + * constructs one candidate of each and returns the first whose {@link DaggerSource#canBuild()} + * reports that it matches the configured {@code SOURCE_DETAILS}. + */ public class DaggerSourceFactory { + /** + * Creates the single {@link DaggerSource} that matches the supplied stream configuration. + * + *

All candidate sources are built and the first one whose {@link DaggerSource#canBuild()} + * returns {@code true} is returned. If none are applicable, an + * {@link InvalidDaggerSourceException} is reported to StatsD and then thrown. + * + * @param streamConfig the per-stream configuration describing the source details + * @param configuration the global Dagger job configuration + * @param deserializer the deserializer that converts raw source records into {@code Row} + * @param statsDReporterSupplier supplier of the StatsD reporter used to report a fatal error + * when no source can be built + * @return the first applicable {@code DaggerSource} of {@code Row} + * @throws InvalidDaggerSourceException if no configured source can handle the {@code SOURCE_DETAILS} + */ public static DaggerSource create(StreamConfig streamConfig, Configuration configuration, DaggerDeserializer deserializer, SerializedStatsDReporterSupplier statsDReporterSupplier) { List> daggerSources = getDaggerSources(streamConfig, configuration, deserializer, statsDReporterSupplier); return daggerSources.stream() @@ -32,6 +56,19 @@ public static DaggerSource create(StreamConfig streamConfig, Configuration }); } + /** + * Builds the ordered list of candidate sources considered by {@link #create}. + * + *

The order is significant: the {@code KafkaSource}-based source is preferred over the + * legacy {@code FlinkKafkaConsumer} source, which in turn precedes the bounded Parquet source. + * Only the first candidate reporting {@link DaggerSource#canBuild()} is ultimately used. + * + * @param streamConfig the per-stream configuration describing the source details + * @param configuration the global Dagger job configuration + * @param deserializer the deserializer shared by all candidate sources + * @param statsDReporterSupplier supplier of the StatsD reporter passed to sources that need it + * @return the list of candidate {@code DaggerSource} instances in selection-priority order + */ private static List> getDaggerSources(StreamConfig streamConfig, Configuration configuration, DaggerDeserializer deserializer, SerializedStatsDReporterSupplier statsDReporterSupplier) { KafkaDaggerSource kafkaDaggerSource = new KafkaDaggerSource(streamConfig, configuration, deserializer); FlinkKafkaConsumerDaggerSource flinkKafkaConsumerDaggerSource = new FlinkKafkaConsumerDaggerSource(streamConfig, configuration, deserializer); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/Stream.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/Stream.java index 78cbd2459..40322602c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/Stream.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/Stream.java @@ -14,27 +14,86 @@ import java.io.Serializable; +/** + * Represents a single configured input stream of a Dagger job, pairing the {@link DaggerSource} + * that produces records with the logical schema/table name those records are registered under. + * + *

One {@code Stream} is created per entry in the {@code STREAMS} configuration. It is + * {@link Serializable} so that it can be captured as part of the Flink job graph. Instances are + * created through the nested {@link Builder}. + */ public class Stream implements Serializable { + /** + * The source responsible for producing this stream's records as Flink {@code Row}s. + */ @Getter private final DaggerSource daggerSource; + /** + * The logical name (schema/table) the stream is registered under; used as the source/table + * name when the stream is added to the Flink execution environment. + */ @Getter private final String streamName; + /** + * Creates a stream binding a source to its registration name. + * + * @param daggerSource the source that produces this stream's records + * @param streamName the schema/table name the stream is registered under + */ Stream(DaggerSource daggerSource, String streamName) { this.daggerSource = daggerSource; this.streamName = streamName; } + /** + * Registers this stream's source onto the given execution environment. + * + *

<p>Delegates to {@link DaggerSource#register(StreamExecutionEnvironment, WatermarkStrategy)}, + * applying the supplied watermark strategy to the produced records. + * + * @param executionEnvironment the Flink execution environment to attach the source to + * @param watermarkStrategy the watermark strategy applied to the emitted records + * @return the {@code DataStream} of {@code Row}s emitted by the underlying source + */ public DataStream registerSource(StreamExecutionEnvironment executionEnvironment, WatermarkStrategy watermarkStrategy) { return daggerSource.register(executionEnvironment, watermarkStrategy); } + /** + * Assembles a fully wired {@link Stream} from a {@link StreamConfig}. + * + *

On {@link #build()} the builder first creates the appropriate deserializer via + * {@link DaggerDeserializerFactory} and then resolves the matching source via + * {@link DaggerSourceFactory}, naming the stream after the configured schema table. + */ public static class Builder { + /** + * The per-stream configuration the stream is built from. + */ private final StreamConfig streamConfig; + /** + * The global Dagger job configuration. + */ private final Configuration configuration; + /** + * The Stencil orchestrator used to resolve Protobuf descriptors for the deserializer. + */ private final StencilClientOrchestrator stencilClientOrchestrator; + /** + * Supplier of the StatsD reporter used for error and metric reporting. + */ private final SerializedStatsDReporterSupplier statsDReporterSupplier; + /** + * Creates a builder holding the inputs required to construct a {@link Stream}. + * + * @param streamConfig the per-stream configuration to build from + * @param configuration the global Dagger job configuration + * @param stencilClientOrchestrator the Stencil orchestrator used to resolve Protobuf + * descriptors for the deserializer + * @param statsDReporterSupplier supplier of the StatsD reporter for error/metric reporting + */ public Builder(StreamConfig streamConfig, Configuration configuration, StencilClientOrchestrator stencilClientOrchestrator, SerializedStatsDReporterSupplier statsDReporterSupplier) { this.streamConfig = streamConfig; this.configuration = configuration; @@ -42,6 +101,12 @@ public Builder(StreamConfig streamConfig, Configuration configuration, StencilCl this.statsDReporterSupplier = statsDReporterSupplier; } + /** + * Creates the deserializer and the matching source and assembles the {@link Stream}. 
+ * + * @return a new {@code Stream} whose source is selected by {@link DaggerSourceFactory} and + * whose name is the configured schema table + */ public Stream build() { DaggerDeserializer daggerDeserializer = DaggerDeserializerFactory.create(streamConfig, configuration, stencilClientOrchestrator, statsDReporterSupplier); DaggerSource daggerSource = DaggerSourceFactory.create(streamConfig, configuration, daggerDeserializer, statsDReporterSupplier); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/StreamsFactory.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/StreamsFactory.java index 9b6b8daf3..c44d6ad0d 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/StreamsFactory.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/StreamsFactory.java @@ -8,7 +8,23 @@ import java.util.ArrayList; import java.util.List; +/** + * Factory that materializes all configured input streams of a Dagger job. + * + *

It parses the {@code STREAMS} configuration into one {@link StreamConfig} per stream and uses + * {@link Stream.Builder} to construct a ready-to-register {@link Stream} for each entry. + */ public class StreamsFactory { + /** + * Parses the stream configuration and builds every declared {@link Stream}. + * + * @param configuration the global Dagger job configuration containing the + * {@code STREAMS} definition + * @param stencilClientOrchestrator the Stencil orchestrator used to resolve Protobuf + * descriptors for each stream's deserializer + * @param statsDReporterSupplier supplier of the StatsD reporter used for error/metric reporting + * @return the list of fully built streams, one per entry in the {@code STREAMS} configuration + */ public static List getStreams(Configuration configuration, StencilClientOrchestrator stencilClientOrchestrator, SerializedStatsDReporterSupplier statsDReporterSupplier) { StreamConfig[] streamConfigs = StreamConfig.parse(configuration); ArrayList streams = new ArrayList<>(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/StreamConfig.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/StreamConfig.java index 6ade4a723..eb21ddef1 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/StreamConfig.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/StreamConfig.java @@ -39,134 +39,241 @@ import static com.gotocompany.dagger.common.core.Constants.STREAM_INPUT_SCHEMA_TABLE; import static com.gotocompany.dagger.core.utils.Constants.*; +/** + * Configuration model describing a single input stream consumed by a Dagger job. + * + *

Dagger reads its input definitions from the {@code INPUT_STREAMS} configuration, whose value is + * a JSON array of stream objects. Each element is deserialized by Gson into a {@code StreamConfig} + * via {@link #parse(Configuration)}, which additionally runs the {@link StreamConfigValidator} + * checks. A single instance captures everything needed to wire up one source: Kafka connection and + * security settings (bootstrap servers, consumer group, SSL/SASL), the input schema (Protobuf class + * or JSON schema and the event-timestamp field), the data type ({@code PROTO} or {@code JSON}), the + * ordered list of backing {@link SourceDetails sources}, and Parquet-specific options (file paths, + * read order, schema-match strategy, and date-range filter). + * + *

Several string fields are validated and normalized during deserialization through Gson + * {@code @JsonAdapter} type adapters in the {@code adapter} package (for example SSL protocol, + * keystore/truststore type, security protocol, and SASL mechanism), so invalid values fail fast at + * job startup. Most getters are generated by Lombok; a handful are hand-written to supply defaults. + */ public class StreamConfig { + /** Shared, pretty-printing Gson instance used to (de)serialize stream configs and Kafka props. */ private static final Gson GSON = new GsonBuilder() .enableComplexMapKeySerialization() .setPrettyPrinting() .create(); + /** Lower-cased key prefix identifying serialized Kafka consumer settings to expose as client props. */ private static final String KAFKA_PREFIX = "source_kafka_consumer_config_"; + /** Password of the private key stored in the Kafka consumer keystore ({@code ssl.key.password}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEY_PASSWORD_KEY) @Getter private String sslKeyPassword; + /** Filesystem path to the SSL keystore presented by the Kafka consumer ({@code ssl.keystore.location}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_LOCATION_KEY) @Getter private String sslKeystoreLocation; + /** Password protecting the Kafka consumer SSL keystore ({@code ssl.keystore.password}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_PASSWORD_KEY) @Getter private String sslKeystorePassword; + /** + * File format of the Kafka consumer SSL keystore ({@code ssl.keystore.type}). + * + *

Validated during deserialization by {@link DaggerSSLKeyStoreFileTypeAdaptor} against the + * supported store types ({@code JKS}, {@code PKCS12}, {@code PEM}). + */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_TYPE_KEY) @Getter @JsonAdapter(value = DaggerSSLKeyStoreFileTypeAdaptor.class) private String sslKeystoreType; + /** + * SSL/TLS protocol version negotiated by the Kafka consumer ({@code ssl.protocol}). + * + *

Validated during deserialization by {@link DaggerSSLProtocolAdaptor} against the supported + * protocol versions. + */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SSL_PROTOCOL_KEY) @Getter @JsonAdapter(value = DaggerSSLProtocolAdaptor.class) private String sslProtocol; + /** Filesystem path to the SSL truststore used by the Kafka consumer ({@code ssl.truststore.location}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_LOCATION_KEY) @Getter private String sslTruststoreLocation; + /** Password protecting the Kafka consumer SSL truststore ({@code ssl.truststore.password}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_PASSWORD_KEY) @Getter private String sslTruststorePassword; + /** + * File format of the Kafka consumer SSL truststore ({@code ssl.truststore.type}). + * + *

Validated during deserialization by {@link DaggerSSLTrustStoreFileTypeAdaptor} against the + * supported store types ({@code JKS}, {@code PKCS12}, {@code PEM}). + */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_TYPE_KEY) @Getter @JsonAdapter(value = DaggerSSLTrustStoreFileTypeAdaptor.class) private String sslTruststoreType; + /** Kafka topic selector for this stream, interpreted as a Java regular expression; see {@link #getTopicPattern()}. */ @SerializedName(STREAM_SOURCE_KAFKA_TOPIC_NAMES_KEY) @Getter private String kafkaTopicNames; + /** Fully-qualified name of the Protobuf message class describing the input schema (used for {@code PROTO} streams). */ @SerializedName(STREAM_INPUT_SCHEMA_PROTO_CLASS) @Getter private String protoClass; + /** Name of the Flink SQL table under which this stream is registered for querying. */ @SerializedName(STREAM_INPUT_SCHEMA_TABLE) @Getter private String schemaTable; + /** Whether the Kafka consumer commits offsets automatically ({@code enable.auto.commit}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_AUTO_COMMIT_ENABLE_KEY) @Getter private String autoCommitEnable; + /** + * Kafka offset-reset policy applied when no committed offset exists for the consumer group + * ({@code auto.offset.reset}), for example {@code latest} or {@code earliest}. + * + *

Defaults to {@code latest} when unset via {@link #getAutoOffsetReset()}. + */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_AUTO_OFFSET_RESET_KEY) private String autoOffsetReset; + /** Kafka consumer group id used when subscribing to the configured topics ({@code group.id}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_GROUP_ID_KEY) @Getter private String consumerGroupId; + /** Comma-separated list of Kafka broker {@code host:port} pairs ({@code bootstrap.servers}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_BOOTSTRAP_SERVERS_KEY) @Getter private String bootstrapServers; + /** + * Security protocol used to connect to Kafka ({@code security.protocol}). + * + *

Validated during deserialization by {@link DaggerSecurityProtocolAdaptor} against the + * supported protocols ({@code SASL_PLAINTEXT}, {@code SASL_SSL}, {@code SSL}). + */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SECURITY_PROTOCOL_KEY) @Getter @JsonAdapter(value = DaggerSecurityProtocolAdaptor.class) private String securityProtocol; + /** + * SASL mechanism used to authenticate the Kafka consumer ({@code sasl.mechanism}). + * + *

Validated during deserialization by {@link DaggerSASLMechanismAdaptor} against the supported + * mechanisms ({@code PLAIN}, {@code SCRAM-SHA-256}, {@code SCRAM-SHA-512}). + */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SASL_MECHANISM_KEY) @JsonAdapter(value = DaggerSASLMechanismAdaptor.class) @Getter private String saslMechanism; + /** JAAS login configuration string supplying SASL credentials ({@code sasl.jaas.config}). */ @SerializedName(SOURCE_KAFKA_CONSUMER_CONFIG_SASL_JAAS_CONFIG_KEY) @Getter private String saslJaasConfig; + /** Logical name of this input stream as referenced elsewhere in the Dagger job configuration. */ @SerializedName(STREAM_INPUT_STREAM_NAME_KEY) @Getter private String kafkaName; + /** JSON schema describing the input records when the stream's data type is {@code JSON}. */ @SerializedName(STREAM_INPUT_SCHEMA_JSON_SCHEMA_KEY) @Getter private String jsonSchema; + /** Name of the JSON field carrying the event timestamp used for watermarking {@code JSON} streams. */ @SerializedName(STREAM_INPUT_SCHEMA_JSON_EVENT_TIMESTAMP_FIELD_NAME_KEY) @Getter private String jsonEventTimestampFieldName; + /** Index (within the configured schema) of the field carrying the event timestamp. */ @SerializedName(STREAM_INPUT_SCHEMA_EVENT_TIMESTAMP_FIELD_INDEX_KEY) @Getter private String eventTimestampFieldIndex; + /** Input record encoding for this stream, either {@code PROTO} or {@code JSON}; defaults to {@code PROTO} via {@link #getDataType()}. */ @SerializedName(STREAM_INPUT_DATATYPE) private String dataType; + /** + * Ordered list of backing sources for this stream, each pairing a {@link SourceName} with a + * {@link SourceType} (for example a bounded Parquet backfill followed by an unbounded Kafka feed). + * + *

Defaults to a single unbounded Kafka consumer source when unset via {@link #getSourceDetails()}. + */ @SerializedName(STREAM_SOURCE_DETAILS_KEY) private SourceDetails[] sourceDetails; + /** + * Parquet file or directory paths to read when a Parquet source is configured. + * + *

Each entry is trimmed of surrounding whitespace during deserialization by + * {@link SourceParquetFilePathsAdapter}. + */ @SerializedName(STREAM_SOURCE_PARQUET_FILE_PATHS_KEY) @Getter @JsonAdapter(value = SourceParquetFilePathsAdapter.class) private String[] parquetFilePaths; + /** Order in which Parquet files are read; defaults to {@code EARLIEST_TIME_URL_FIRST} via {@link #getParquetFilesReadOrderStrategy()}. */ @SerializedName(STREAM_SOURCE_PARQUET_READ_ORDER_STRATEGY_KEY) private SourceParquetReadOrderStrategy parquetFilesReadOrderStrategy; + /** Strategy used to reconcile each Parquet file's schema with the configured schema while reading. */ @SerializedName(STREAM_SOURCE_PARQUET_SCHEMA_MATCH_STRATEGY_KEY) @Getter private SourceParquetSchemaMatchStrategy parquetSchemaMatchStrategy; + /** + * Additional Kafka consumer properties merged into the generated client configuration. + * + *

During deserialization {@link DaggerKafkaConsumerAdditionalConfigurationsAdaptor} validates + * that every key matches the consumer prefix and drops entries with {@code null} values. + */ @SerializedName(SOURCE_KAFKA_CONSUMER_ADDITIONAL_CONFIGURATIONS) @JsonAdapter(value = DaggerKafkaConsumerAdditionalConfigurationsAdaptor.class) @Getter private Map additionalConsumerConfigurations; + /** + * Time window(s) used to restrict which Parquet files are read, based on the timestamps in their + * paths. + * + *

Parsed from a semicolon-separated list of ISO-8601 timestamp pairs by + * {@link FileDateRangeAdaptor} into a {@link TimeRangePool}. + */ @SerializedName(STREAM_SOURCE_PARQUET_FILE_DATE_RANGE_KEY) @JsonAdapter(FileDateRangeAdaptor.class) @Getter private TimeRangePool parquetFileDateRange; + /** + * Returns the input record encoding for this stream, defaulting to {@code PROTO} when unspecified. + * + *

The first call lazily caches the {@code PROTO} default back onto the field so subsequent + * serialization reflects the resolved value. + * + * @return the configured data type, or {@code "PROTO"} when none was provided + */ public String getDataType() { if (dataType == null) { dataType = "PROTO"; @@ -174,6 +281,15 @@ public String getDataType() { return dataType; } + /** + * Returns the backing sources for this stream, defaulting to a single unbounded Kafka consumer. + * + *

When no {@code sourceDetails} were configured, a one-element array consisting of a + * {@link SourceName#KAFKA_CONSUMER} source with {@link SourceType#UNBOUNDED} is returned so legacy + * Kafka-only streams keep working without explicit source declarations. + * + * @return the configured source details, or the default Kafka-consumer source when none were set + */ public SourceDetails[] getSourceDetails() { if (sourceDetails == null) { return new SourceDetails[]{new SourceDetails(SourceName.KAFKA_CONSUMER, SourceType.UNBOUNDED)}; @@ -182,6 +298,12 @@ public SourceDetails[] getSourceDetails() { } } + /** + * Returns the Parquet read-order strategy, defaulting to earliest-timestamp-first when unset. + * + * @return the configured {@link SourceParquetReadOrderStrategy}, or + * {@link SourceParquetReadOrderStrategy#EARLIEST_TIME_URL_FIRST} when none was provided + */ public SourceParquetReadOrderStrategy getParquetFilesReadOrderStrategy() { if (parquetFilesReadOrderStrategy == null) { return SourceParquetReadOrderStrategy.EARLIEST_TIME_URL_FIRST; @@ -190,6 +312,13 @@ public SourceParquetReadOrderStrategy getParquetFilesReadOrderStrategy() { } } + /** + * Returns the Kafka {@code auto.offset.reset} policy, defaulting to {@code latest} when unspecified. + * + *

<p>The first call lazily caches the {@code latest} default back onto the field. + * + * @return the configured offset-reset policy, or {@code "latest"} when none was provided + */ public String getAutoOffsetReset() { if (autoOffsetReset == null) { autoOffsetReset = "latest"; @@ -197,6 +326,18 @@ public String getAutoOffsetReset() { return autoOffsetReset; } + /** + * Parses the {@code INPUT_STREAMS} configuration value into an array of validated stream configs. + * + *

The configuration holds a JSON array of stream objects (empty string when absent). Each + * element is deserialized with Gson and then passed through + * {@link StreamConfigValidator#validateSourceDetails(StreamConfig)} and + * {@link StreamConfigValidator#validateParquetDataSourceStreamConfigs(StreamConfig)}, so malformed + * source declarations cause the job to fail fast during startup. + * + * @param configuration the Dagger job configuration to read the {@code INPUT_STREAMS} value from + * @return one validated {@code StreamConfig} per element of the configured JSON array + */ public static StreamConfig[] parse(Configuration configuration) { String jsonArrayString = configuration.getString(INPUT_STREAMS, ""); JsonReader reader = new JsonReader(new StringReader(jsonArrayString)); @@ -208,6 +349,18 @@ public static StreamConfig[] parse(Configuration configuration) { .toArray(StreamConfig[]::new); } + /** + * Builds the Kafka client {@link Properties} for this stream's consumer. + * + *

This config is round-tripped through JSON to a flat map; entries whose key (lower-cased) + * starts with the {@code source_kafka_consumer_config_} prefix are converted into dotted Kafka + * property names (see {@link #parseVarName(String, String)}) and copied into the result. Finally + * {@link #setAdditionalKafkaConsumerConfigs(Properties, Configuration)} applies large-message + * tuning and any user-supplied extra consumer properties. + * + * @param configuration the Dagger job configuration, consulted for large-message handling + * @return the assembled Kafka consumer properties for this stream + */ public Properties getKafkaProps(Configuration configuration) { String jsonString = GSON.toJson(this); Map streamConfigMap = GSON.fromJson(jsonString, Map.class); @@ -220,11 +373,34 @@ public Properties getKafkaProps(Configuration configuration) { return kafkaProps; } + /** + * Converts a serialized consumer-config key into its dotted Kafka property name. + * + *

<p>The key is lower-cased, the {@code source_kafka_consumer_config_} prefix is stripped, and the + * remaining underscore-delimited segments are re-joined with dots. For example + * {@code SOURCE_KAFKA_CONSUMER_CONFIG_BOOTSTRAP_SERVERS} becomes {@code bootstrap.servers}. + * + * @param varName the serialized configuration key to translate + * @param kafkaPrefix the lower-case prefix to remove before splitting (the consumer-config prefix); it is applied with {@code String.replaceAll}, so it is treated as a regular expression and every match is removed + * @return the equivalent dot-separated Kafka client property name + */ private String parseVarName(String varName, String kafkaPrefix) { String[] names = varName.toLowerCase().replaceAll(kafkaPrefix, "").split("_"); return String.join(".", names); } + /** + * Augments the given Kafka properties with large-message tuning and user-supplied extra configs. + * + *

When large-message consumption is enabled in the job configuration, the consumer's + * {@code max.partition.fetch.bytes} is raised to the configured default. Any + * {@link #additionalConsumerConfigurations} present are normalized via + * {@link KafkaConfigUtil#parseKafkaConfiguration} for the {@link KafkaConnectorTypesMetadata#SOURCE} + * connector and merged in, overriding earlier values on key collision. + * + * @param kafkaProps the properties to mutate in place with the additional settings + * @param configuration the Dagger job configuration consulted for large-message handling + */ private void setAdditionalKafkaConsumerConfigs(Properties kafkaProps, Configuration configuration) { if (configuration.getBoolean(SOURCE_KAFKA_CONSUME_LARGE_MESSAGE_ENABLE_KEY, SOURCE_KAFKA_CONSUME_LARGE_MESSAGE_ENABLE_DEFAULT)) { kafkaProps.setProperty(SOURCE_KAFKA_MAX_PARTITION_FETCH_BYTES_KEY, SOURCE_KAFKA_MAX_PARTITION_FETCH_BYTES_DEFAULT); @@ -236,14 +412,38 @@ private void setAdditionalKafkaConsumerConfigs(Properties kafkaProps, Configurat } } + /** + * Compiles the configured topic selector into a regular-expression {@link Pattern}. + * + *

<p>Dagger subscribes to Kafka topics by pattern, so the raw {@code kafkaTopicNames} value is + * treated as a regex rather than a literal topic list. + * + * @return a compiled pattern matching the topics this stream should consume + */ public Pattern getTopicPattern() { return Pattern.compile(kafkaTopicNames); } + /** + * Builds the Flink Kafka source starting offset from the committed consumer-group offsets. + * + *

<p>The returned {@link OffsetsInitializer} resumes from the group's committed offsets and falls + * back to the configured {@code auto.offset.reset} strategy when no committed offset is available. + * + * @return an offsets initializer seeded with committed offsets and the configured reset strategy + */ public OffsetsInitializer getStartingOffset() { return OffsetsInitializer.committedOffsets(getOffsetResetStrategy()); } + /** + * Maps the configured {@code auto.offset.reset} value onto a Kafka {@link OffsetResetStrategy}. + * + *

The configured value is upper-cased to match the enum constant names + * (for example {@code latest} maps to {@link OffsetResetStrategy#LATEST}). + * + * @return the offset-reset strategy corresponding to the configured value + */ private OffsetResetStrategy getOffsetResetStrategy() { return OffsetResetStrategy.valueOf(autoOffsetReset.toUpperCase()); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/StreamConfigValidator.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/StreamConfigValidator.java index d48d9ad97..a31ae1bdb 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/StreamConfigValidator.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/StreamConfigValidator.java @@ -10,7 +10,29 @@ import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_DETAILS_KEY; import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_PARQUET_FILE_PATHS_KEY; +/** + * Validation helpers applied to {@link StreamConfig} instances right after they are parsed. + * + *

{@link StreamConfig#parse} chains these static methods so that structurally invalid stream + * definitions abort job startup with a descriptive message instead of failing later. Each method + * returns the same config it received, which lets them be composed in a {@link java.util.stream.Stream} + * pipeline. The checks are implemented with Guava {@link com.google.common.base.Preconditions}, so a + * failed assertion throws {@link IllegalArgumentException}. + */ public class StreamConfigValidator { + /** + * Verifies that a stream declares at least one fully-specified source. + * + *

The {@code sourceDetails} array must be non-empty, contain no {@code null} elements, and each + * element must have both a non-null {@link com.gotocompany.dagger.core.source.config.models.SourceName} + * and {@link com.gotocompany.dagger.core.source.config.models.SourceType} (an unknown or + * whitespace-padded value deserializes to {@code null} and is rejected here). + * + * @param streamConfig the parsed stream configuration to validate + * @return the same {@code streamConfig}, unchanged, when all source details are valid + * @throws IllegalArgumentException if the source details are empty, contain a {@code null} entry, + * or have a missing source name or source type + */ public static StreamConfig validateSourceDetails(StreamConfig streamConfig) { SourceDetails[] sourceDetailsArray = streamConfig.getSourceDetails(); Preconditions.checkArgument(sourceDetailsArray.length != 0, "%s config is set to " @@ -29,6 +51,17 @@ public static StreamConfig validateSourceDetails(StreamConfig streamConfig) { return streamConfig; } + /** + * Runs Parquet-specific validation when the stream uses a Parquet source. + * + *

If any configured source is {@link com.gotocompany.dagger.core.source.config.models.SourceName#PARQUET_SOURCE}, + * the config is forwarded to {@link #validateParquetFilePaths(StreamConfig)} to ensure the file + * paths are usable; otherwise it is returned untouched. + * + * @param streamConfig the parsed stream configuration to inspect + * @return the same {@code streamConfig}, after Parquet path validation when applicable + * @throws IllegalArgumentException if a Parquet source is present but its file paths are invalid + */ public static StreamConfig validateParquetDataSourceStreamConfigs(StreamConfig streamConfig) { SourceDetails[] sourceDetailsArray = streamConfig.getSourceDetails(); for (SourceDetails sourceDetails : sourceDetailsArray) { @@ -42,6 +75,17 @@ public static StreamConfig validateParquetDataSourceStreamConfigs(StreamConfig s return streamConfig; } + /** + * Ensures the Parquet file paths required by a Parquet source are present and non-null. + * + *

The {@code parquetFilePaths} array must not be {@code null}, and no individual entry may be + * the literal string {@code "null"} (which is how an absent path round-trips through the JSON + * configuration). + * + * @param streamConfig the parsed stream configuration whose Parquet paths are validated + * @return the same {@code streamConfig}, unchanged, when the file paths are valid + * @throws IllegalArgumentException if the paths array is {@code null} or any entry is {@code "null"} + */ private static StreamConfig validateParquetFilePaths(StreamConfig streamConfig) { String[] parquetFilePaths = streamConfig.getParquetFilePaths(); Preconditions.checkArgument(parquetFilePaths != null, "%s is required for configuring a " diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerKafkaConsumerAdditionalConfigurationsAdaptor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerKafkaConsumerAdditionalConfigurationsAdaptor.java index 15b7f83e9..c7623c9b9 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerKafkaConsumerAdditionalConfigurationsAdaptor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerKafkaConsumerAdditionalConfigurationsAdaptor.java @@ -11,14 +11,46 @@ import java.util.Map; import java.util.stream.Collectors; +/** + * Gson {@link TypeAdapter} that reads and validates arbitrary extra Kafka consumer properties while a + * {@code StreamConfig} is deserialized. + * + *

<p>It is wired in through a {@code @JsonAdapter} annotation on the stream config's + * {@code additionalConsumerConfigurations} field, letting operators pass through Kafka consumer + * settings that are not modelled as dedicated fields. Every key must match the consumer-config naming + * pattern of {@link KafkaConnectorTypesMetadata#SOURCE}, and entries whose value is {@code null} are + * dropped. + */ public class DaggerKafkaConsumerAdditionalConfigurationsAdaptor extends TypeAdapter<Map<String, String>> { + /** + * Serializes the additional consumer configuration map to JSON. + * + *

<p>The map is converted to its JSON object form with a plain {@link Gson} instance and written + * verbatim to the stream. + * + * @param jsonWriter the writer receiving the serialized map + * @param stringStringMap the additional consumer properties to serialize + * @throws IOException if writing to the underlying JSON stream fails + */ @Override public void write(JsonWriter jsonWriter, Map<String, String> stringStringMap) throws IOException { Gson gson = new Gson(); jsonWriter.jsonValue(gson.toJson(stringStringMap)); } + /** + * Reads the additional consumer configuration object, validating keys and discarding null values. + * + *

Each key is matched against the {@link KafkaConnectorTypesMetadata#SOURCE} configuration + * pattern; if any key fails to match, the entire configuration is rejected. The surviving entries + * whose values are non-{@code null} are returned as the parsed map. + * + * @param jsonReader the reader positioned at the JSON object of additional properties + * @return a map of the valid, non-{@code null} additional consumer properties + * @throws IOException if reading from the underlying JSON stream fails + * @throws IllegalArgumentException if any key does not match the consumer configuration pattern + */ @Override public Map read(JsonReader jsonReader) throws IOException { Gson gson = new Gson(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSASLMechanismAdaptor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSASLMechanismAdaptor.java index 97e6edd11..c99ee421d 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSASLMechanismAdaptor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSASLMechanismAdaptor.java @@ -9,7 +9,24 @@ import java.io.IOException; import java.util.Arrays; +/** + * Gson {@link TypeAdapter} that validates the Kafka consumer SASL mechanism while a + * {@code StreamConfig} is deserialized. + * + *

<p>It is wired in through a {@code @JsonAdapter} annotation on the stream config's + * {@code saslMechanism} field and accepts only the mechanisms listed in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SASL_MECHANISM} ({@code PLAIN}, + * {@code SCRAM-SHA-256} or {@code SCRAM-SHA-512}); any other value aborts job startup with an + * {@link InvalidConfigurationException}. + */ public class DaggerSASLMechanismAdaptor extends TypeAdapter<String> { + /** + * Serializes the SASL mechanism value back to JSON, emitting a JSON {@code null} when it is unset. + * + * @param jsonWriter the writer receiving the serialized value + * @param value the SASL mechanism to write; a {@code null} is rendered as a JSON null literal + * @throws IOException if writing to the underlying JSON stream fails + */ @Override public void write(JsonWriter jsonWriter, String value) throws IOException { if (value == null) { @@ -19,6 +36,15 @@ public void write(JsonWriter jsonWriter, String value) throws IOException { jsonWriter.value(value); } + /** + * Reads the SASL mechanism value from JSON and validates it against the supported mechanisms.
+ * + * @param jsonReader the reader positioned at the SASL mechanism string + * @return the SASL mechanism when it is one of the supported values + * @throws IOException if reading from the underlying JSON stream fails + * @throws InvalidConfigurationException if the value is not present in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SASL_MECHANISM} + */ @Override public String read(JsonReader jsonReader) throws IOException { String saslMechanism = jsonReader.nextString(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLKeyStoreFileTypeAdaptor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLKeyStoreFileTypeAdaptor.java index 4d227a656..ce3427ece 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLKeyStoreFileTypeAdaptor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLKeyStoreFileTypeAdaptor.java @@ -9,7 +9,24 @@ import java.io.IOException; import java.util.Arrays; +/** + * Gson {@link TypeAdapter} that validates the Kafka consumer SSL keystore file type while a + * {@code StreamConfig} is deserialized. + * + *

<p>It is wired in through a {@code @JsonAdapter} annotation on the stream config's + * {@code sslKeystoreType} field and accepts only the store formats listed in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SSL_STORE_FILE_TYPE} ({@code JKS}, + * {@code PKCS12} or {@code PEM}); any other value aborts job startup with an + * {@link InvalidConfigurationException}. + */ public class DaggerSSLKeyStoreFileTypeAdaptor extends TypeAdapter<String> { + /** + * Serializes the keystore type value back to JSON, emitting a JSON {@code null} when it is unset. + * + * @param jsonWriter the writer receiving the serialized value + * @param value the keystore type to write; a {@code null} is rendered as a JSON null literal + * @throws IOException if writing to the underlying JSON stream fails + */ @Override public void write(JsonWriter jsonWriter, String value) throws IOException { if (value == null) { @@ -19,6 +36,15 @@ public void write(JsonWriter jsonWriter, String value) throws IOException { jsonWriter.value(value); } + /** + * Reads the keystore type value from JSON and validates it against the supported store formats.
+ * + * @param jsonReader the reader positioned at the keystore type string + * @return the keystore type when it is one of the supported values + * @throws IOException if reading from the underlying JSON stream fails + * @throws InvalidConfigurationException if the value is not present in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SSL_STORE_FILE_TYPE} + */ @Override public String read(JsonReader jsonReader) throws IOException { String keyStoreFileType = jsonReader.nextString(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLProtocolAdaptor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLProtocolAdaptor.java index 737e4e94a..c7d3d6e96 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLProtocolAdaptor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLProtocolAdaptor.java @@ -9,7 +9,24 @@ import java.io.IOException; import java.util.Arrays; +/** + * Gson {@link TypeAdapter} that validates the Kafka consumer SSL/TLS protocol while a + * {@code StreamConfig} is deserialized. + * + *

<p>It is wired in through a {@code @JsonAdapter} annotation on the stream config's + * {@code sslProtocol} field and accepts only the protocol versions listed in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SSL_PROTOCOL} (for example {@code TLSv1.2}, + * {@code TLSv1.3} or {@code SSL}); any other value aborts job startup with an + * {@link InvalidConfigurationException}. + */ public class DaggerSSLProtocolAdaptor extends TypeAdapter<String> { + /** + * Serializes the SSL protocol value back to JSON, emitting a JSON {@code null} when it is unset. + * + * @param jsonWriter the writer receiving the serialized value + * @param value the SSL protocol to write; a {@code null} is rendered as a JSON null literal + * @throws IOException if writing to the underlying JSON stream fails + */ @Override public void write(JsonWriter jsonWriter, String value) throws IOException { if (value == null) { @@ -19,6 +36,15 @@ public void write(JsonWriter jsonWriter, String value) throws IOException { jsonWriter.value(value); } + /** + * Reads the SSL protocol value from JSON and validates it against the supported versions.
+ * + * @param jsonReader the reader positioned at the SSL protocol string + * @return the SSL protocol when it is one of the supported values + * @throws IOException if reading from the underlying JSON stream fails + * @throws InvalidConfigurationException if the value is not present in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SSL_PROTOCOL} + */ @Override public String read(JsonReader jsonReader) throws IOException { String sslProtocol = jsonReader.nextString(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLTrustStoreFileTypeAdaptor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLTrustStoreFileTypeAdaptor.java index ac1ef004f..3d76d0bff 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLTrustStoreFileTypeAdaptor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSSLTrustStoreFileTypeAdaptor.java @@ -9,7 +9,24 @@ import java.io.IOException; import java.util.Arrays; +/** + * Gson {@link TypeAdapter} that validates the Kafka consumer SSL truststore file type while a + * {@code StreamConfig} is deserialized. + * + *

<p>It is wired in through a {@code @JsonAdapter} annotation on the stream config's + * {@code sslTruststoreType} field and accepts only the store formats listed in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SSL_STORE_FILE_TYPE} ({@code JKS}, + * {@code PKCS12} or {@code PEM}); any other value aborts job startup with an + * {@link InvalidConfigurationException}. + */ public class DaggerSSLTrustStoreFileTypeAdaptor extends TypeAdapter<String> { + /** + * Serializes the truststore type value back to JSON, emitting a JSON {@code null} when it is unset. + * + * @param jsonWriter the writer receiving the serialized value + * @param value the truststore type to write; a {@code null} is rendered as a JSON null literal + * @throws IOException if writing to the underlying JSON stream fails + */ @Override public void write(JsonWriter jsonWriter, String value) throws IOException { if (value == null) { @@ -19,6 +36,15 @@ public void write(JsonWriter jsonWriter, String value) throws IOException { jsonWriter.value(value); } + /** + * Reads the truststore type value from JSON and validates it against the supported store formats.
+ * + * @param jsonReader the reader positioned at the truststore type string + * @return the truststore type when it is one of the supported values + * @throws IOException if reading from the underlying JSON stream fails + * @throws InvalidConfigurationException if the value is not present in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SSL_STORE_FILE_TYPE} + */ @Override public String read(JsonReader jsonReader) throws IOException { String trustStoreFileType = jsonReader.nextString(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSecurityProtocolAdaptor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSecurityProtocolAdaptor.java index 7f0351a41..3e2b99552 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSecurityProtocolAdaptor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/DaggerSecurityProtocolAdaptor.java @@ -9,7 +9,24 @@ import java.io.IOException; import java.util.Arrays; +/** + * Gson {@link TypeAdapter} that validates the Kafka consumer security protocol while a + * {@code StreamConfig} is deserialized. + * + *

<p>It is wired in through a {@code @JsonAdapter} annotation on the stream config's + * {@code securityProtocol} field and accepts only the protocols listed in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SECURITY_PROTOCOL} ({@code SASL_PLAINTEXT}, + * {@code SASL_SSL} or {@code SSL}); any other value aborts job startup with an + * {@link InvalidConfigurationException}. + */ public class DaggerSecurityProtocolAdaptor extends TypeAdapter<String> { + /** + * Serializes the security protocol value back to JSON, emitting a JSON {@code null} when it is unset. + * + * @param jsonWriter the writer receiving the serialized value + * @param value the security protocol to write; a {@code null} is rendered as a JSON null literal + * @throws IOException if writing to the underlying JSON stream fails + */ @Override public void write(JsonWriter jsonWriter, String value) throws IOException { if (value == null) { @@ -19,6 +36,15 @@ public void write(JsonWriter jsonWriter, String value) throws IOException { jsonWriter.value(value); } + /** + * Reads the security protocol value from JSON and validates it against the supported protocols.
+ * + * @param jsonReader the reader positioned at the security protocol string + * @return the security protocol when it is one of the supported values + * @throws IOException if reading from the underlying JSON stream fails + * @throws InvalidConfigurationException if the value is not present in + * {@link Constants#SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SECURITY_PROTOCOL} + */ @Override public String read(JsonReader jsonReader) throws IOException { String securityProtocol = jsonReader.nextString(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/FileDateRangeAdaptor.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/FileDateRangeAdaptor.java index 147dd2624..cca8ceb6c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/FileDateRangeAdaptor.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/FileDateRangeAdaptor.java @@ -15,12 +15,43 @@ import java.util.Date; import java.util.TimeZone; +/** + * Gson {@link TypeAdapter} that parses a Parquet file date-range expression into a + * {@link TimeRangePool} while a {@code StreamConfig} is deserialized. + * + *

<p>It is wired in through a {@code @JsonAdapter} annotation on the stream config's + * {@code parquetFileDateRange} field. The configured value is a single string holding one or more + * ranges separated by {@code ;}, each range being a start and an end timestamp separated by a comma. Each + * timestamp may be written either with a trailing {@code Z} ({@code yyyy-MM-dd'T'HH:mm:ss'Z'}) or without it + * ({@code yyyy-MM-dd'T'HH:mm:ss}); both forms are interpreted as UTC. Malformed input, or a start that is after + * its end, raises an {@link InvalidTimeRangeException}. Serialization is a no-op since the range is + * read-only configuration. + */ public class FileDateRangeAdaptor extends TypeAdapter<TimeRangePool> { + /** + * No-op serializer; the date range is read-only configuration and is never written back to JSON. + * + * @param out the writer (unused) + * @param value the time-range pool (unused) + */ @Override public void write(JsonWriter out, TimeRangePool value) { } + /** + * Parses the date-range expression into a {@link TimeRangePool}. + * + *

The value is split on {@code ;} into individual ranges, and each range is split on {@code ,} + * into a start and end timestamp. Both timestamps are parsed via {@link #parseInstant(String)} and + * added as a {@link TimeRange}; the start must not be after the end. + * + * @param reader the reader positioned at the date-range string + * @return a pool containing one {@link TimeRange} per configured timestamp pair + * @throws IOException if reading from the underlying JSON stream fails + * @throws InvalidTimeRangeException if a range does not contain exactly two timestamps, or its + * start is after its end + */ @Override public TimeRangePool read(JsonReader reader) throws IOException { TimeRangePool timeRangePool = new TimeRangePool(); @@ -42,6 +73,17 @@ public TimeRangePool read(JsonReader reader) throws IOException { return timeRangePool; } + /** + * Parses a single timestamp string into an {@link Instant} in the UTC time zone. + * + *

Two layouts are accepted and distinguished purely by their character length: the UTC form + * {@code yyyy-MM-dd'T'HH:mm:ss'Z'} and the local form {@code yyyy-MM-dd'T'HH:mm:ss}; both are + * interpreted as UTC. + * + * @param timestamp the timestamp text to parse + * @return the parsed instant + * @throws InvalidTimeRangeException if the text matches neither supported layout or cannot be parsed + */ private Instant parseInstant(String timestamp) { String utcDateFormatPattern = "yyyy-MM-dd'T'HH:mm:ss'Z'"; SimpleDateFormat utcDateFormat = new SimpleDateFormat(utcDateFormatPattern); @@ -60,6 +102,14 @@ private Instant parseInstant(String timestamp) { throw new InvalidTimeRangeException(String.format("Unable to parse timestamp: %s with supported date formats i.e. yyyy-MM-ddTHH:mm:ssZ and yyyy-MM-ddTHH:mm:ss", timestamp)); } + /** + * Parses a timestamp with the supplied formatter, translating parse failures into a domain error. + * + * @param timestamp the timestamp text to parse + * @param simpleDateFormat the formatter (already configured for the UTC time zone) to apply + * @return the parsed {@link Date} + * @throws InvalidTimeRangeException if the text cannot be parsed by the given formatter + */ private Date parse(String timestamp, SimpleDateFormat simpleDateFormat) { try { return simpleDateFormat.parse(timestamp); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/SourceParquetFilePathsAdapter.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/SourceParquetFilePathsAdapter.java index ecd24cc47..b14cee8a0 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/SourceParquetFilePathsAdapter.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/adapter/SourceParquetFilePathsAdapter.java @@ -8,11 +8,35 @@ import java.io.IOException; import java.util.Arrays; +/** + * Gson {@link TypeAdapter} that reads the configured Parquet source file paths 
into a trimmed + * {@code String[]} while a {@code StreamConfig} is deserialized. + * + *

It is wired in through a {@code @JsonAdapter} annotation on the stream config's + * {@code parquetFilePaths} field. Reading delegates to a plain {@link Gson} instance to parse the JSON + * array and then trims surrounding whitespace from every entry, so stray spaces in the configuration do + * not corrupt the file URLs. Serialization is intentionally a no-op because these paths are never + * written back out. + */ public class SourceParquetFilePathsAdapter extends TypeAdapter { + /** + * No-op serializer; Parquet file paths are read-only configuration and are never written to JSON. + * + * @param jsonWriter the writer (unused) + * @param strings the file paths (unused) + */ @Override public void write(JsonWriter jsonWriter, String[] strings) { } + /** + * Reads the JSON array of Parquet file paths and returns them trimmed of surrounding whitespace. + * + * @param jsonReader the reader positioned at the JSON array of file paths + * @return a new {@code String[]} containing each configured path with leading and trailing + * whitespace removed + * @throws IOException if reading from the underlying JSON stream fails + */ @Override public String[] read(JsonReader jsonReader) throws IOException { Gson gson = new Gson(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceDetails.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceDetails.java index 668c2aa3e..3ee736051 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceDetails.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceDetails.java @@ -8,15 +8,32 @@ import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_DETAILS_SOURCE_NAME_KEY; import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_DETAILS_SOURCE_TYPE_KEY; +/** + * Immutable pairing of a {@link SourceName} and {@link SourceType} describing one backing source. + * + *

A {@code StreamConfig} carries an array of these, defining the ordered chain of sources a stream + * is read from (for example a {@link SourceType#BOUNDED} Parquet backfill followed by an + * {@link SourceType#UNBOUNDED} Kafka feed). Both properties are populated by Gson from the + * {@code SOURCE_DETAILS} configuration and are checked by {@code StreamConfigValidator}. The type is + * {@link Serializable} so it can be shipped with Flink operator state. + */ public class SourceDetails implements Serializable { + /** The source connector to use, deserialized from the {@code SOURCE_NAME} configuration key. */ @SerializedName(STREAM_SOURCE_DETAILS_SOURCE_NAME_KEY) @Getter private SourceName sourceName; + /** Whether the source is bounded or unbounded, deserialized from the {@code SOURCE_TYPE} configuration key. */ @SerializedName(STREAM_SOURCE_DETAILS_SOURCE_TYPE_KEY) @Getter private SourceType sourceType; + /** + * Creates a source descriptor pairing a connector name with its boundedness. + * + * @param sourceName the source connector to read from + * @param sourceType whether that source is bounded or unbounded + */ public SourceDetails(SourceName sourceName, SourceType sourceType) { this.sourceName = sourceName; this.sourceType = sourceType; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceName.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceName.java index f70658c55..146969e7a 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceName.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceName.java @@ -6,11 +6,21 @@ import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_DETAILS_SOURCE_NAME_KAFKA; import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_DETAILS_SOURCE_NAME_PARQUET; +/** + * Names the kind of backing source a Dagger stream reads from, as declared in the stream 
config. + * + *

Each constant carries the {@code @SerializedName} alias used in the {@code SOURCE_DETAILS} JSON of + * a {@code StreamConfig}, so a job selects a source by its configured string. The chosen name + * determines which Dagger source implementation is instantiated for the stream. + */ public enum SourceName { + /** Flink FLIP-27 {@code KafkaSource}-based reader; serialized as {@code "KAFKA_SOURCE"}. */ @SerializedName(STREAM_SOURCE_DETAILS_SOURCE_NAME_KAFKA) KAFKA_SOURCE, + /** Bounded Parquet file reader used for backfills from object storage; serialized as {@code "PARQUET_SOURCE"}. */ @SerializedName(STREAM_SOURCE_DETAILS_SOURCE_NAME_PARQUET) PARQUET_SOURCE, + /** Legacy {@code FlinkKafkaConsumer}-based Kafka reader retained for compatibility; serialized as {@code "KAFKA_CONSUMER"}. */ @SerializedName(STREAM_SOURCE_DETAILS_SOURCE_NAME_KAFKA_CONSUMER) KAFKA_CONSUMER } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceType.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceType.java index 749b7689f..421e210b6 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceType.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/SourceType.java @@ -5,9 +5,19 @@ import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_DETAILS_SOURCE_TYPE_BOUNDED; import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_DETAILS_SOURCE_TYPE_UNBOUNDED; +/** + * Declares whether a stream's source is bounded (finite) or unbounded (continuous). + * + *

Each constant carries the {@code @SerializedName} alias used in the {@code SOURCE_DETAILS} JSON of + * a {@code StreamConfig}. The boundedness maps onto Flink's batch-versus-streaming execution: a bounded + * source (such as a Parquet backfill) terminates once exhausted, while an unbounded source (such as + * Kafka) runs indefinitely. + */ public enum SourceType { + /** Finite source that completes once all records are read, e.g. a Parquet backfill; serialized as {@code "BOUNDED"}. */ @SerializedName(STREAM_SOURCE_DETAILS_SOURCE_TYPE_BOUNDED) BOUNDED, + /** Continuous source that never completes on its own, e.g. Kafka; serialized as {@code "UNBOUNDED"}. */ @SerializedName(STREAM_SOURCE_DETAILS_SOURCE_TYPE_UNBOUNDED) UNBOUNDED } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/TimeRange.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/TimeRange.java index fdf7977ae..08980669f 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/TimeRange.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/TimeRange.java @@ -5,17 +5,39 @@ import java.io.Serializable; import java.time.Instant; +/** + * Immutable, serializable closed time window bounded by an inclusive start and end {@link Instant}. + * + *

Time ranges are aggregated in a {@link TimeRangePool} to express which Parquet files a Dagger + * stream should ingest, based on the timestamps encoded in their paths. The class-level + * {@code @Getter} exposes Lombok-generated accessors for both bounds, and implementing + * {@link Serializable} lets the range travel with Flink source state. + */ @Getter public class TimeRange implements Serializable { + /** + * Creates a time range with the given inclusive start and end bounds. + * + * @param startInstant the start of the window (inclusive) + * @param endInstant the end of the window (inclusive) + */ public TimeRange(Instant startInstant, Instant endInstant) { this.startInstant = startInstant; this.endInstant = endInstant; } + /** Inclusive start of the window. */ private Instant startInstant; + /** Inclusive end of the window. */ private Instant endInstant; + /** + * Reports whether the given instant lies within this window, inclusive of both bounds. + * + * @param instant the timestamp to test + * @return {@code true} when {@code instant} equals either bound or lies strictly between them + */ public boolean contains(Instant instant) { return instant.equals(startInstant) || instant.equals(endInstant) || (instant.isAfter(startInstant) && instant.isBefore(endInstant)); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/TimeRangePool.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/TimeRangePool.java index d291d5ee1..89e3057e6 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/TimeRangePool.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/config/models/TimeRangePool.java @@ -7,18 +7,42 @@ import java.util.ArrayList; import java.util.List; +/** + * Serializable, ordered collection of {@link TimeRange} windows used to scope Parquet ingestion. + * + *

When a Dagger stream reads from a Parquet source, its configured date range is deserialized (via + * {@code FileDateRangeAdaptor}) into one of these pools. The pool then answers whether a given event + * {@link Instant} falls inside any configured window, which drives which timestamped Parquet files are + * selected for processing. Implementing {@link Serializable} lets it travel with Flink source state. + */ public class TimeRangePool implements Serializable { + /** + * Creates an empty pool that initially contains no time ranges. + */ public TimeRangePool() { this.timeRanges = new ArrayList<>(); } + /** Backing list of configured time windows, in the order they were added. */ @Getter private List timeRanges; + /** + * Appends a time range to the pool. + * + * @param timeRange the window to add + * @return {@code true}, since the backing {@link List} always grows + */ public boolean add(TimeRange timeRange) { return timeRanges.add(timeRange); } + /** + * Reports whether the given instant falls within any window in this pool. 
+ * + * @param instant the timestamp to test + * @return {@code true} if at least one {@link TimeRange} contains {@code instant}, else {@code false} + */ public boolean contains(Instant instant) { return timeRanges.stream().anyMatch(timeRange -> timeRange.contains(instant)); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/flinkkafkaconsumer/FlinkKafkaConsumerCustom.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/flinkkafkaconsumer/FlinkKafkaConsumerCustom.java index 14860175c..83b87e970 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/flinkkafkaconsumer/FlinkKafkaConsumerCustom.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/flinkkafkaconsumer/FlinkKafkaConsumerCustom.java @@ -18,7 +18,13 @@ */ public class FlinkKafkaConsumerCustom extends FlinkKafkaConsumer { + /** + * Dagger configuration used to build the {@link ErrorReporter} when consumption fails. + */ private Configuration configuration; + /** + * Reporter used to publish fatal exceptions raised while consuming Kafka records. + */ private ErrorReporter errorReporter; /** @@ -35,6 +41,17 @@ public FlinkKafkaConsumerCustom(Pattern subscriptionPattern, KafkaDeserializatio this.configuration = configuration; } + /** + * {@inheritDoc} + * + *

Delegates to {@code runBaseConsumer} and, on any non-chained failure, reports the + * exception as fatal through the {@link ErrorReporter} before rethrowing it. An + * {@code ExceptionInChainedOperatorException} is rethrown unchanged so that downstream + * operator failures are not masked. + * + * @param sourceContext the Flink source context that emitted records are written to + * @throws Exception if the underlying Kafka consumer fails + */ @Override public void run(SourceContext sourceContext) throws Exception { try { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/flinkkafkaconsumer/FlinkKafkaConsumerDaggerSource.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/flinkkafkaconsumer/FlinkKafkaConsumerDaggerSource.java index 5e17b0a63..05cca3c63 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/flinkkafkaconsumer/FlinkKafkaConsumerDaggerSource.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/flinkkafkaconsumer/FlinkKafkaConsumerDaggerSource.java @@ -16,32 +16,86 @@ import static com.gotocompany.dagger.core.source.config.models.SourceName.KAFKA_CONSUMER; import static com.gotocompany.dagger.core.source.config.models.SourceType.UNBOUNDED; +/** + * {@link DaggerSource} implementation backed by Flink's legacy {@code FlinkKafkaConsumer} API. + * + *

This source is selected for unbounded streams whose {@code SOURCE_DETAILS} declare a single + * {@link SourceName#KAFKA_CONSUMER} source of type {@link SourceType#UNBOUNDED}. It wraps the + * configured topic pattern, Kafka properties, and deserializer into a + * {@link FlinkKafkaConsumerCustom} and registers it on the Flink execution environment. Newer jobs + * should prefer {@code KafkaDaggerSource} (the {@code KafkaSource}-based implementation); this + * variant is retained for backwards compatibility. + */ public class FlinkKafkaConsumerDaggerSource implements DaggerSource { + /** + * Deserializer applied to each Kafka record; must also be a {@code KafkaDeserializationSchema}. + */ private final DaggerDeserializer deserializer; + /** + * The per-stream configuration supplying the topic pattern and Kafka properties. + */ private final StreamConfig streamConfig; + /** + * The global Dagger job configuration used when resolving Kafka properties. + */ private final Configuration configuration; + /** + * The single source name this implementation supports ({@code KAFKA_CONSUMER}). + */ private static final SourceName SUPPORTED_SOURCE_NAME = KAFKA_CONSUMER; + /** + * The single source type this implementation supports ({@code UNBOUNDED}). + */ private static final SourceType SUPPORTED_SOURCE_TYPE = UNBOUNDED; + /** + * Creates a source from the given stream configuration, job configuration, and deserializer. 
+ * + * @param streamConfig the per-stream configuration carrying the topic pattern and Kafka props + * @param configuration the global Dagger job configuration + * @param deserializer the record deserializer; expected to also implement + * {@code KafkaDeserializationSchema} + */ public FlinkKafkaConsumerDaggerSource(StreamConfig streamConfig, Configuration configuration, DaggerDeserializer deserializer) { this.streamConfig = streamConfig; this.configuration = configuration; this.deserializer = deserializer; } + /** + * Builds the underlying {@link FlinkKafkaConsumerCustom} from the stream configuration. + * + *

The deserializer is cast to a {@code KafkaDeserializationSchema} and combined with the + * configured topic pattern and resolved Kafka properties. + * + * @return a configured {@link FlinkKafkaConsumerCustom} ready to be added to the environment + */ FlinkKafkaConsumerCustom buildSource() { KafkaDeserializationSchema kafkaDeserializationSchema = (KafkaDeserializationSchema) deserializer; return new FlinkKafkaConsumerCustom(streamConfig.getTopicPattern(), kafkaDeserializationSchema, streamConfig.getKafkaProps(configuration), configuration); } + /** + * {@inheritDoc} + * + *

Builds the {@link FlinkKafkaConsumerCustom}, applies the watermark strategy to it, and adds + * it to the execution environment via {@code addSource}. + */ @Override public DataStream register(StreamExecutionEnvironment executionEnvironment, WatermarkStrategy watermarkStrategy) { FlinkKafkaConsumerCustom source = buildSource(); return executionEnvironment.addSource(source.assignTimestampsAndWatermarks(watermarkStrategy)); } + /** + * {@inheritDoc} + * + *

Returns {@code true} only when exactly one {@code SOURCE_DETAILS} entry is configured with + * source name {@link SourceName#KAFKA_CONSUMER} and type {@link SourceType#UNBOUNDED}, and the + * deserializer is a {@code KafkaDeserializationSchema}. + */ @Override public boolean canBuild() { SourceDetails[] sourceDetailsArray = streamConfig.getSourceDetails(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/kafka/KafkaDaggerSource.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/kafka/KafkaDaggerSource.java index 550b6fb23..9f20b0771 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/kafka/KafkaDaggerSource.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/kafka/KafkaDaggerSource.java @@ -15,19 +15,60 @@ import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema; import org.apache.flink.types.Row; +/** + * {@link DaggerSource} implementation backed by Flink's modern {@link KafkaSource} connector + * (the FLIP-27 source API). + * + *

This is the preferred Kafka source for unbounded Dagger streams. It is selected when the + * configured {@code SOURCE_DETAILS} declare a single {@link SourceName#KAFKA_SOURCE} of type + * {@link SourceType#UNBOUNDED}. The configured topic pattern, starting offsets, Kafka properties, + * and deserializer are assembled into a {@link KafkaSource} that is then attached to the execution + * environment via {@code fromSource}. + */ public class KafkaDaggerSource implements DaggerSource { + /** + * Deserializer applied to each Kafka record; must also be a {@code KafkaDeserializationSchema}. + */ private final DaggerDeserializer deserializer; + /** + * The per-stream configuration supplying the topic pattern, starting offsets, and Kafka props. + */ private final StreamConfig streamConfig; + /** + * The global Dagger job configuration used when resolving Kafka properties. + */ private final Configuration configuration; + /** + * The single source name this implementation supports ({@code KAFKA_SOURCE}). + */ private static final SourceName SUPPORTED_SOURCE_NAME = SourceName.KAFKA_SOURCE; + /** + * The single source type this implementation supports ({@code UNBOUNDED}). + */ private static final SourceType SUPPORTED_SOURCE_TYPE = SourceType.UNBOUNDED; + /** + * Creates a source from the given stream configuration, job configuration, and deserializer. + * + * @param streamConfig the per-stream configuration carrying the topic pattern, offsets and props + * @param configuration the global Dagger job configuration + * @param deserializer the record deserializer; expected to also implement + * {@code KafkaDeserializationSchema} + */ public KafkaDaggerSource(StreamConfig streamConfig, Configuration configuration, DaggerDeserializer deserializer) { this.streamConfig = streamConfig; this.configuration = configuration; this.deserializer = deserializer; } + /** + * Builds the underlying Flink {@link KafkaSource} from the stream configuration. + * + *

The deserializer is adapted to a {@code KafkaRecordDeserializationSchema} and the source is + * configured with the topic pattern, starting offsets, and Kafka client properties. + * + * @return a configured {@link KafkaSource} of {@code Row} + */ KafkaSource buildSource() { KafkaRecordDeserializationSchema kafkaRecordDeserializationSchema = KafkaRecordDeserializationSchema .of((KafkaDeserializationSchema) deserializer); @@ -39,11 +80,24 @@ KafkaSource buildSource() { .build(); } + /** + * {@inheritDoc} + * + *

Registers the built {@link KafkaSource} on the environment via {@code fromSource}, using the + * supplied watermark strategy and the stream's schema table as the source name. + */ @Override public DataStream register(StreamExecutionEnvironment executionEnvironment, WatermarkStrategy watermarkStrategy) { return executionEnvironment.fromSource(buildSource(), watermarkStrategy, streamConfig.getSchemaTable()); } + /** + * {@inheritDoc} + * + *

Returns {@code true} only when exactly one {@code SOURCE_DETAILS} entry is configured with + * source name {@link SourceName#KAFKA_SOURCE} and type {@link SourceType#UNBOUNDED}, and the + * deserializer is a {@code KafkaDeserializationSchema}. + */ @Override public boolean canBuild() { SourceDetails[] sourceDetailsArray = streamConfig.getSourceDetails(); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetDaggerSource.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetDaggerSource.java index f61b3c7ae..32655fa6d 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetDaggerSource.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetDaggerSource.java @@ -31,15 +31,56 @@ import static com.gotocompany.dagger.core.source.config.models.SourceName.PARQUET_SOURCE; import static com.gotocompany.dagger.core.source.config.models.SourceType.BOUNDED; +/** + * {@link DaggerSource} implementation that reads bounded batches of records from Parquet files via + * Flink's {@code FileSource}. + * + *

It is selected when the configured {@code SOURCE_DETAILS} declare a single + * {@link SourceName#PARQUET_SOURCE} of type {@link SourceType#BOUNDED} and the deserializer is a + * {@link SimpleGroupDeserializer}. The source discovers the configured Parquet paths, builds a + * {@link ParquetFileRecordFormat} (which yields a {@code ParquetReader} per file), and orders the + * resulting splits according to the configured {@link SourceParquetReadOrderStrategy} — currently + * only chronological ordering via {@link ChronologyOrderedSplitAssigner} is supported. Fatal + * configuration errors are reported through StatsD before being thrown. + */ public class ParquetDaggerSource implements DaggerSource { + /** + * Deserializer applied to each Parquet record; must be a {@link SimpleGroupDeserializer}. + */ private final DaggerDeserializer deserializer; + /** + * The per-stream configuration supplying the Parquet paths, date range, and read-order strategy. + */ private final StreamConfig streamConfig; + /** + * The global Dagger job configuration. + */ private final Configuration configuration; + /** + * Supplier of the StatsD reporter propagated into the file source, readers, and split assigner. + */ private final SerializedStatsDReporterSupplier statsDReporterSupplier; + /** + * The single source type this implementation supports ({@code BOUNDED}). + */ private static final SourceType SUPPORTED_SOURCE_TYPE = BOUNDED; + /** + * The single source name this implementation supports ({@code PARQUET_SOURCE}). + */ private static final SourceName SUPPORTED_SOURCE_NAME = PARQUET_SOURCE; + /** + * Error reporter used to surface fatal configuration errors to StatsD. + */ private final StatsDErrorReporter statsDErrorReporter; + /** + * Creates a Parquet source from the given configuration, deserializer, and StatsD supplier. 
+ * + * @param streamConfig the per-stream configuration carrying Parquet paths and ordering + * @param configuration the global Dagger job configuration + * @param deserializer the record deserializer; expected to be a {@link SimpleGroupDeserializer} + * @param statsDReporterSupplier supplier of the StatsD reporter used for error/metric reporting + */ public ParquetDaggerSource(StreamConfig streamConfig, Configuration configuration, DaggerDeserializer deserializer, SerializedStatsDReporterSupplier statsDReporterSupplier) { this.streamConfig = streamConfig; this.configuration = configuration; @@ -48,11 +89,25 @@ public ParquetDaggerSource(StreamConfig streamConfig, Configuration configuratio this.statsDErrorReporter = new StatsDErrorReporter(statsDReporterSupplier); } + /** + * {@inheritDoc} + * + *

Builds the bounded Parquet {@code FileSource} and registers it on the environment via + * {@code fromSource}, using the supplied watermark strategy and the stream's schema table as the + * source name. + */ @Override public DataStream register(StreamExecutionEnvironment executionEnvironment, WatermarkStrategy watermarkStrategy) { return executionEnvironment.fromSource(buildFileSource(), watermarkStrategy, streamConfig.getSchemaTable()); } + /** + * {@inheritDoc} + * + *

Returns {@code true} only when exactly one {@code SOURCE_DETAILS} entry is configured with + * source name {@link SourceName#PARQUET_SOURCE} and type {@link SourceType#BOUNDED}, and the + * deserializer is a {@link SimpleGroupDeserializer}. + */ @Override public boolean canBuild() { SourceDetails[] sourceDetailsArray = streamConfig.getSourceDetails(); @@ -66,6 +121,15 @@ public boolean canBuild() { } } + /** + * Assembles the Flink {@code FileSource} for the configured Parquet inputs. + * + *

Wires together the file paths, the {@link ParquetFileRecordFormat}, the source type, and the + * read-order-derived {@code FileSplitAssigner.Provider} through {@link ParquetFileSource.Builder}, + * then delegates to {@link ParquetFileSource#buildFileSource()}. + * + * @return the configured Flink {@code FileSource} of {@code Row} + */ FileSource buildFileSource() { ParquetFileSource.Builder parquetFileSourceBuilder = ParquetFileSource.Builder.getInstance(); ParquetFileRecordFormat parquetFileRecordFormat = buildParquetFileRecordFormat(); @@ -82,6 +146,11 @@ FileSource buildFileSource() { return parquetFileSource.buildFileSource(); } + /** + * Converts the configured Parquet path strings into Flink {@link Path} instances. + * + * @return an array of Flink {@code Path}s, one per configured Parquet file path + */ private Path[] buildFlinkFilePaths() { String[] parquetFilePaths = streamConfig.getParquetFilePaths(); return Arrays.stream(parquetFilePaths) @@ -89,6 +158,17 @@ private Path[] buildFlinkFilePaths() { .toArray(Path[]::new); } + /** + * Selects the split-assigner provider matching the configured read-order strategy. + * + *

For {@link SourceParquetReadOrderStrategy#EARLIEST_TIME_URL_FIRST} it returns a provider that + * builds a {@link ChronologyOrderedSplitAssigner} configured with the parquet date range, a + * {@link HourDatePathParser}, and the StatsD supplier. The index-ordered strategy is not yet + * supported: a {@link DaggerConfigurationException} is reported to StatsD and thrown. + * + * @return a {@code FileSplitAssigner.Provider} for the configured ordering strategy + * @throws DaggerConfigurationException if the configured read-order strategy is unsupported + */ private FileSplitAssigner.Provider buildParquetFileSplitAssignerProvider() { SourceParquetReadOrderStrategy readOrderStrategy = streamConfig.getParquetFilesReadOrderStrategy(); switch (readOrderStrategy) { @@ -107,6 +187,15 @@ private FileSplitAssigner.Provider buildParquetFileSplitAssignerProvider() { } } + /** + * Builds the {@link ParquetFileRecordFormat} that produces a reader per Parquet file. + * + *

Wraps the {@link SimpleGroupDeserializer} in a {@link ParquetReader.ParquetReaderProvider} + * and supplies a serializable {@code Supplier} of the produced {@code TypeInformation} so the + * record format can advertise its output type to Flink. + * + * @return the configured {@code ParquetFileRecordFormat} + */ private ParquetFileRecordFormat buildParquetFileRecordFormat() { SimpleGroupDeserializer simpleGroupDeserializer = (SimpleGroupDeserializer) deserializer; ReaderProvider parquetFileReaderProvider = new ParquetReader.ParquetReaderProvider(simpleGroupDeserializer, statsDReporterSupplier); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetFileRecordFormat.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetFileRecordFormat.java index d7966ac82..5ca3eff00 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetFileRecordFormat.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetFileRecordFormat.java @@ -14,27 +14,85 @@ import java.io.Serializable; import java.util.function.Supplier; +/** + * Flink {@link FileRecordFormat} that reads Parquet files and emits their rows as Flink + * {@code Row}s. + * + *

For each split, {@link #createReader} delegates to a {@link ReaderProvider} (typically a + * {@code ParquetReader.ParquetReaderProvider}) to open a reader over the file. The format is + * non-splittable — a Parquet file is always read whole by a single reader — and advertises its + * produced {@code TypeInformation} via a supplied provider. Because Parquet readers are not + * offset-based, {@link #restoreReader} is unsupported. All error reporting goes through a + * serializable {@code Supplier} of {@link StatsDErrorReporter} so the format can be safely shipped + * as part of the Flink job graph. + */ public class ParquetFileRecordFormat implements FileRecordFormat<Row> { /* FileRecordFormat object and all it's fields need to be serializable in order to construct the Flink job graph. Even though StatsDErrorReporter is serializable, it contains StatsDErrorReporter, which in turn contains more fields which may not be serializable.Hence, in order to mitigate job graph creation failures, we wrap the error reporter inside a serializable lambda. This is a common idiom to make un-serializable fields serializable in Java 8: https://stackoverflow.com/a/22808112 */ + /** + * Factory that opens a reader over a given Parquet file path. + */ private final ReaderProvider parquetFileReaderProvider; + /** + * Supplies the {@code TypeInformation} describing the {@code Row}s produced by this format. + */ private final Supplier<TypeInformation<Row>> typeInformationProvider; + /** + * Serializable supplier of the StatsD error reporter used to report fatal errors. + */ private final Supplier<StatsDErrorReporter> statsDErrorReporterSupplier; + /** + * Creates a record format; use {@link Builder} to construct one. + * + * <p>

The supplied StatsD reporter supplier is wrapped in a serializable lambda so the resulting + * error reporter can be created lazily on the task managers. + * + * @param parquetFileReaderProvider the provider that opens a reader per Parquet file + * @param typeInformationProvider supplier of the produced {@code Row} type information + * @param statsDReporterSupplier supplier of the StatsD reporter for error reporting + */ private ParquetFileRecordFormat(ReaderProvider parquetFileReaderProvider, Supplier<TypeInformation<Row>> typeInformationProvider, SerializedStatsDReporterSupplier statsDReporterSupplier) { this.parquetFileReaderProvider = parquetFileReaderProvider; this.typeInformationProvider = typeInformationProvider; this.statsDErrorReporterSupplier = (Supplier<StatsDErrorReporter> & Serializable) () -> new StatsDErrorReporter(statsDReporterSupplier); } + /** + * {@inheritDoc} + * + * <p>

Opens a fresh Parquet reader for the given file via the {@link ReaderProvider}. The split + * offset and length are ignored because the format is non-splittable and Parquet files are read + * in full. + * + * @param config the Flink configuration (unused) + * @param filePath the path of the Parquet file to read + * @param splitOffset the split start offset (ignored) + * @param splitLength the split length (ignored) + * @return a reader over the whole Parquet file + */ @Override public Reader createReader(Configuration config, Path filePath, long splitOffset, long splitLength) { return parquetFileReaderProvider.getReader(filePath.toString()); } + /** + * {@inheritDoc} + * + *

Unsupported for Parquet: readers have no notion of offsets and therefore cannot be restored + * to a previous position. Always reports a fatal error and throws. + * + * @param config the Flink configuration (unused) + * @param filePath the path of the Parquet file (unused) + * @param restoredOffset the offset to restore from (unused) + * @param splitOffset the split start offset (unused) + * @param splitLength the split length (unused) + * @return never returns normally + * @throws UnsupportedOperationException always, since Parquet readers cannot be restored by offset + */ @Override public Reader restoreReader(Configuration config, Path filePath, long restoredOffset, long splitOffset, long splitLength) { UnsupportedOperationException ex = new UnsupportedOperationException("Error: ParquetReader do not have offsets and hence cannot be restored " @@ -43,46 +101,109 @@ public Reader restoreReader(Configuration config, Path filePath, long resto throw ex; } + /** + * {@inheritDoc} + * + *

Always returns {@code false}: a Parquet file is read in its entirety by a single reader and + * is never broken into sub-splits. + * + * @return {@code false}, indicating the format is not splittable + */ @Override public boolean isSplittable() { return false; } + /** + * {@inheritDoc} + * + *

Returns the {@code Row} type information supplied at construction time. + * + * @return the {@code TypeInformation} of the produced rows + */ @Override public TypeInformation<Row> getProducedType() { return typeInformationProvider.get(); } + /** + * Fluent builder for {@link ParquetFileRecordFormat} that validates required dependencies before + * constructing the format. + */ public static class Builder { + /** + * The provider that opens a reader per Parquet file; required. + */ private ReaderProvider parquetFileReaderProvider; + /** + * Supplier of the produced {@code Row} type information; required. + */ private Supplier<TypeInformation<Row>> typeInformationProvider; + /** + * Supplier of the StatsD reporter for error reporting; required. + */ private SerializedStatsDReporterSupplier statsDReporterSupplier; + /** + * Creates a new builder with all dependencies unset. + * + * @return a fresh {@link Builder} instance + */ public static Builder getInstance() { return new Builder(); } + /** + * Initializes an empty builder; all dependencies must be set before {@link #build()}. + */ private Builder() { this.parquetFileReaderProvider = null; this.typeInformationProvider = null; this.statsDReporterSupplier = null; } + /** + * Sets the provider that opens a reader per Parquet file. + * + * @param parquetFileReaderProvider the reader provider + * @return this builder + */ public Builder setParquetFileReaderProvider(ReaderProvider parquetFileReaderProvider) { this.parquetFileReaderProvider = parquetFileReaderProvider; return this; } + /** + * Sets the supplier of the produced {@code Row} type information. + * + * @param typeInformationProvider the type information supplier + * @return this builder + */ public Builder setTypeInformationProvider(Supplier<TypeInformation<Row>> typeInformationProvider) { this.typeInformationProvider = typeInformationProvider; return this; } + /** + * Sets the supplier of the StatsD reporter used for error reporting. 
+ * + * @param statsDReporterSupplier the StatsD reporter supplier + * @return this builder + */ public Builder setStatsDReporterSupplier(SerializedStatsDReporterSupplier statsDReporterSupplier) { this.statsDReporterSupplier = statsDReporterSupplier; return this; } + /** + * Validates that all dependencies are set and constructs the {@link ParquetFileRecordFormat}. + * + *

If any required dependency is missing, the resulting {@link IllegalArgumentException} is + * reported to StatsD (when a supplier is available) and rethrown. + * + * @return the built {@code ParquetFileRecordFormat} + * @throws IllegalArgumentException if any required dependency is {@code null} + */ public ParquetFileRecordFormat build() { try { checkArgument(parquetFileReaderProvider != null, "ReaderProvider is required but is set as null"); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetFileSource.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetFileSource.java index 5141ca227..259e3cbd4 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetFileSource.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/ParquetFileSource.java @@ -17,18 +17,51 @@ import static com.google.api.client.util.Preconditions.checkArgument; import static com.gotocompany.dagger.core.source.config.models.SourceType.BOUNDED; +/** + * Immutable, serializable description of everything required to build a Flink {@code FileSource} + * over Parquet files, together with a fluent {@link Builder}. + * + *

Instances are produced by {@link ParquetDaggerSource} and hold the file paths, the + * {@link FileRecordFormat} that turns each file into {@code Row}s, the {@link FileSplitAssigner} + * provider that decides the processing order of splits, the source {@link SourceType}, and the + * Dagger configuration. It is {@link Serializable} so it can participate in the Flink job graph. + */ public class ParquetFileSource implements Serializable { + /** + * Whether the source is bounded or unbounded; only {@link SourceType#BOUNDED} is supported. + */ @Getter private final SourceType sourceType; + /** + * The Parquet file paths to read from. + */ @Getter private final Path[] filePaths; + /** + * The Dagger job configuration associated with this source. + */ @Getter private final Configuration configuration; + /** + * The record format that decodes each Parquet file into Flink {@code Row}s. + */ @Getter private final FileRecordFormat fileRecordFormat; + /** + * Provider of the split assigner that controls the order in which file splits are processed. + */ @Getter private final FileSplitAssigner.Provider fileSplitAssigner; + /** + * Creates an immutable Parquet file source description; use {@link Builder} to construct one. + * + * @param sourceType the boundedness of the source (expected to be {@code BOUNDED}) + * @param configuration the Dagger job configuration + * @param fileRecordFormat the record format decoding Parquet files into {@code Row}s + * @param filePaths the Parquet file paths to read + * @param fileSplitAssigner the provider of the file split assigner controlling read order + */ private ParquetFileSource(SourceType sourceType, Configuration configuration, FileRecordFormat fileRecordFormat, @@ -41,24 +74,63 @@ private ParquetFileSource(SourceType sourceType, this.fileSplitAssigner = fileSplitAssigner; } + /** + * Builds the Flink {@link FileSource} from this description. + * + *
<p>
Uses {@code FileSource.forRecordFileFormat} with the configured record format and file + * paths, applying the configured split assigner. + * + * @return the constructed Flink {@code FileSource} of {@code Row} + */ public FileSource buildFileSource() { return FileSource.forRecordFileFormat(fileRecordFormat, filePaths) .setSplitAssigner(fileSplitAssigner) .build(); } + /** + * Fluent builder for {@link ParquetFileSource} that applies sensible defaults and validates the + * configuration before constructing the immutable source. + */ public static class Builder { + /** + * The source boundedness; defaults to {@link SourceType#BOUNDED}. + */ private SourceType sourceType; + /** + * The Parquet file paths to read; defaults to an empty array. + */ private Path[] filePaths; + /** + * The record format decoding Parquet files into {@code Row}s; required. + */ private FileRecordFormat fileRecordFormat; + /** + * The Dagger job configuration. + */ private Configuration configuration; + /** + * Provider of the split assigner; defaults to Flink's {@code LocalityAwareSplitAssigner}. + */ private FileSplitAssigner.Provider fileSplitAssigner; + /** + * Supplier of the StatsD reporter; required for error reporting during validation. + */ private SerializedStatsDReporterSupplier statsDReporterSupplier; + /** + * Creates a new builder pre-populated with default values. + * + * @return a fresh {@link Builder} instance + */ public static Builder getInstance() { return new Builder(); } + /** + * Initializes the builder defaults: a {@code BOUNDED} source with no paths, no record + * format, and Flink's locality-aware split assigner. + */ private Builder() { this.sourceType = SourceType.BOUNDED; this.configuration = null; @@ -67,31 +139,67 @@ private Builder() { this.fileSplitAssigner = LocalityAwareSplitAssigner::new; } + /** + * Sets the source boundedness. 
+ * + * @param sourceType the source type; only {@link SourceType#BOUNDED} is supported + * @return this builder + */ public Builder setSourceType(SourceType sourceType) { this.sourceType = sourceType; return this; } + /** + * Sets the record format that decodes Parquet files into {@code Row}s. + * + * @param fileRecordFormat the record format to use + * @return this builder + */ public Builder setFileRecordFormat(FileRecordFormat fileRecordFormat) { this.fileRecordFormat = fileRecordFormat; return this; } + /** + * Sets the provider of the split assigner controlling the order splits are processed in. + * + * @param fileSplitAssigner the split assigner provider + * @return this builder + */ public Builder setFileSplitAssigner(FileSplitAssigner.Provider fileSplitAssigner) { this.fileSplitAssigner = fileSplitAssigner; return this; } + /** + * Sets the Parquet file paths to read from. + * + * @param filePaths the file paths + * @return this builder + */ public Builder setFilePaths(Path[] filePaths) { this.filePaths = filePaths; return this; } + /** + * Sets the Dagger job configuration. + * + * @param configuration the configuration + * @return this builder + */ public Builder setConfiguration(Configuration configuration) { this.configuration = configuration; return this; } + /** + * Sets the supplier of the StatsD reporter used for error reporting during validation. 
+ * + * @param statsDReporterSupplier the StatsD reporter supplier + * @return this builder + */ public Builder setStatsDReporterSupplier(SerializedStatsDReporterSupplier statsDReporterSupplier) { this.statsDReporterSupplier = statsDReporterSupplier; return this; @@ -99,6 +207,17 @@ public Builder setStatsDReporterSupplier(SerializedStatsDReporterSupplier statsD /* other validations if required before creating the file source can be put here */ /* for example, checking that all the file paths conform to just one partitioning strategy */ + /** + * Validates that all required builder fields are set and the source type is supported. + * + *
<p>
Ensures the StatsD supplier, record format, and at least one file path are present and + * that the source type is {@link SourceType#BOUNDED} (unbounded Parquet sources are not yet + * supported). On failure the {@link IllegalArgumentException} is reported to StatsD (when a + * supplier is available) and rethrown. + * + * @throws IllegalArgumentException if a required field is missing or the source type is not + * {@code BOUNDED} + */ private void sanityCheck() { try { checkArgument(statsDReporterSupplier != null, "SerializedStatsDReporterSupplier is required but is set as null"); @@ -113,6 +232,12 @@ private void sanityCheck() { } } + /** + * Validates the builder state and constructs the immutable {@link ParquetFileSource}. + * + * @return the built {@code ParquetFileSource} + * @throws IllegalArgumentException if {@link #sanityCheck()} fails + */ public ParquetFileSource build() { sanityCheck(); return new ParquetFileSource(sourceType, diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/SourceParquetReadOrderStrategy.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/SourceParquetReadOrderStrategy.java index 5fe43b6bb..95471053b 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/SourceParquetReadOrderStrategy.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/SourceParquetReadOrderStrategy.java @@ -5,9 +5,24 @@ import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_PARQUET_READ_ORDER_STRATEGY_EARLIEST_INDEX_FIRST; import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_PARQUET_READ_ORDER_STRATEGY_EARLIEST_TIME_URL_FIRST; +/** + * Strategy controlling the order in which discovered Parquet files (splits) are handed to readers. + * + *
<p>
Configured via the parquet read-order stream property; the {@link SerializedName} annotations + * bind each constant to its configuration string during Gson deserialization. + * {@code ParquetDaggerSource} uses the selected strategy to pick the matching split assigner. + */ public enum SourceParquetReadOrderStrategy { + /** + * Process splits in ascending chronological order of the timestamp parsed from each file path + * (earliest partition time first), backed by the {@code ChronologyOrderedSplitAssigner}. + */ @SerializedName(STREAM_SOURCE_PARQUET_READ_ORDER_STRATEGY_EARLIEST_TIME_URL_FIRST) EARLIEST_TIME_URL_FIRST, + /** + * Process splits in their discovery (index) order; not yet supported and currently rejected at + * source-construction time. + */ @SerializedName(STREAM_SOURCE_PARQUET_READ_ORDER_STRATEGY_EARLIEST_INDEX_FIRST) EARLIEST_INDEX_FIRST } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/SourceParquetSchemaMatchStrategy.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/SourceParquetSchemaMatchStrategy.java index c4b9c78ec..6ad75d370 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/SourceParquetSchemaMatchStrategy.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/SourceParquetSchemaMatchStrategy.java @@ -5,9 +5,23 @@ import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_PARQUET_BACKWARD_COMPATIBLE_SCHEMA_MATCH_STRATEGY; import static com.gotocompany.dagger.core.utils.Constants.STREAM_SOURCE_PARQUET_SAME_SCHEMA_MATCH_STRATEGY; +/** + * Strategy controlling how the schema of the Parquet files being read is matched against the + * expected (configured) message schema. + * + *
<p>
Configured via the parquet schema-match stream property; the {@link SerializedName} annotations + * bind each constant to its configuration string during Gson deserialization. + */ public enum SourceParquetSchemaMatchStrategy { + /** + * Require the file schema to be identical to the expected schema, failing on any mismatch. + */ @SerializedName(STREAM_SOURCE_PARQUET_SAME_SCHEMA_MATCH_STRATEGY) SAME_SCHEMA_WITH_FAIL_ON_MISMATCH, + /** + * Allow backward-compatible schema differences (for example added or removed fields), failing + * only when a common field's data type does not match. + */ @SerializedName(STREAM_SOURCE_PARQUET_BACKWARD_COMPATIBLE_SCHEMA_MATCH_STRATEGY) BACKWARD_COMPATIBLE_SCHEMA_WITH_FAIL_ON_TYPE_MISMATCH } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/path/HourDatePathParser.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/path/HourDatePathParser.java index 55dc465c1..cd271f70f 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/path/HourDatePathParser.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/path/HourDatePathParser.java @@ -11,7 +11,27 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +/** + * {@link PathParser} for the {@code dt=YYYY-MM-DD[/hr=HH]} Hive-style partition layout. + * + *
<p>
It extracts a UTC {@link Instant} from a Parquet file path by matching a {@code dt=} date + * segment and an optional {@code hr=} hour segment. When the hour segment is present the instant is + * resolved to the hour; otherwise it falls back to midnight UTC of the matched date. Paths that + * match neither shape cause a {@link ParseException}. The class is {@link Serializable} so it can be + * captured by the split assigner within the Flink job graph. + */ public class HourDatePathParser implements PathParser, Serializable { + /** + * {@inheritDoc} + * + *
<p>
Matches the path against {@code dt=YYYY-MM-DD} and an optional {@code hr=HH} segment. If + * both date and hour are present the resulting {@link Instant} is at hour granularity; if only + * the date is present it is midnight UTC of that date. + * + * @param path the Parquet file path to parse + * @return the UTC {@link Instant} derived from the path's date (and optional hour) + * @throws ParseException if the path matches no supported partitioning scheme + */ @Override public Instant instantFromFilePath(Path path) throws ParseException { Pattern filePathPattern = Pattern.compile("^.*/dt=([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9])/(hr=([0-9][0-9]))?.*$"); @@ -30,12 +50,27 @@ public Instant instantFromFilePath(Path path) throws ParseException { } } + /** + * Parses a date-only partition segment into a UTC {@link Instant} at midnight. + * + * @param dateSegment the date string in {@code yyyy-MM-dd} format + * @return the {@link Instant} at 00:00 UTC of the given date + * @throws ParseException if {@code dateSegment} cannot be parsed + */ private Instant convertToInstant(String dateSegment) throws ParseException { SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd"); simpleDateFormat.setTimeZone(TimeZone.getTimeZone(ZoneOffset.UTC)); return simpleDateFormat.parse(dateSegment).toInstant(); } + /** + * Parses a date and hour partition pair into a UTC {@link Instant} at hour granularity. 
+ * + * @param dateSegment the date string in {@code yyyy-MM-dd} format + * @param hourSegment the two-digit hour-of-day string ({@code HH}) + * @return the {@link Instant} at the given date and hour in UTC + * @throws ParseException if the combined date and hour cannot be parsed + */ private Instant convertToInstant(String dateSegment, String hourSegment) throws ParseException { SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH"); simpleDateFormat.setTimeZone(TimeZone.getTimeZone(ZoneOffset.UTC)); diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/path/PathParser.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/path/PathParser.java index bf893bba6..bfb26d407 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/path/PathParser.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/path/PathParser.java @@ -5,7 +5,21 @@ import java.text.ParseException; import java.time.Instant; +/** + * Extracts an event-time {@link Instant} from a Parquet file {@link Path} based on its partition + * layout. + * + *
<p>
Implementations decode the date/hour partition encoded in the file path so that the bounded + * Parquet source can order file splits chronologically (see {@code ChronologyOrderedSplitAssigner}). + */ public interface PathParser { + /** + * Parses the timestamp encoded in the given file path's partition segments. + * + * @param path the Parquet file path to parse + * @return the {@link Instant} represented by the path's date/hour partition + * @throws ParseException if the path does not conform to a recognised partitioning scheme + */ Instant instantFromFilePath(Path path) throws ParseException; } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/reader/ParquetReader.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/reader/ParquetReader.java index 52bb817ac..2c8efe295 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/reader/ParquetReader.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/reader/ParquetReader.java @@ -32,21 +32,84 @@ import java.io.IOException; import java.time.Instant; +/** + * Flink {@code FileRecordFormat.Reader} that streams the rows of a single Parquet file as Flink + * {@code Row}s. + * + *
<p>
The reader iterates the Parquet file one row group at a time, decodes each record into a + * {@link SimpleGroup}, and converts it to a {@code Row} via the {@link SimpleGroupDeserializer}. + * Read throughput and deserialization latency are recorded through Dagger's StatsD histogram and + * counter managers, and any deserialization failure is reported as a fatal error. Because Parquet + * files have no byte offsets suitable for arbitrary resumption, the checkpointed position only + * tracks the number of emitted records. Instances are created by the nested + * {@link ParquetReaderProvider}. + */ public class ParquetReader implements FileRecordFormat.Reader { + /** + * The Hadoop path of the Parquet file being read. + */ private final Path hadoopFilePath; + /** + * Deserializer converting each Parquet {@link SimpleGroup} into a Flink {@code Row}. + */ private final SimpleGroupDeserializer simpleGroupDeserializer; + /** + * Index of the next record to read within the current row group. + */ private long currentRecordIndex; + /** + * The underlying Parquet file reader that supplies row groups. + */ private final ParquetFileReader parquetFileReader; + /** + * Number of records in the current row group. + */ private long rowCount; + /** + * Whether the per-row-group record reader has been initialized. + */ private boolean isRecordReaderInitialized; + /** + * Reader over the records of the current row group. + */ private RecordReader recordReader; + /** + * The Parquet message schema of the file. + */ private final MessageType schema; + /** + * Running count of records emitted so far; used as the checkpointed position. + */ private long totalEmittedRowCount; + /** + * Counter manager recording reader lifecycle and row-emission metrics. + */ private DaggerCounterManager daggerCounterManager; + /** + * Histogram manager recording per-row read and deserialization timings. 
+ */ private DaggerHistogramManager daggerHistogramManager; + /** + * Reporter used to surface fatal deserialization errors to StatsD. + */ private final StatsDErrorReporter statsDErrorReporter; + /** + * Logger for reader lifecycle events. + */ private static final Logger LOGGER = LoggerFactory.getLogger(ParquetReader.class.getName()); + /** + * Creates a reader over an already-opened Parquet file; use {@link ParquetReaderProvider}. + * + *
<p>
Reads the file schema from the reader's metadata, registers the StatsD measurement + * managers, and increments the reader-created counter. + * + * @param hadoopFilePath the Hadoop path of the Parquet file + * @param simpleGroupDeserializer the deserializer converting groups to {@code Row}s + * @param parquetFileReader the opened Parquet file reader + * @param statsDReporterSupplier supplier of the StatsD reporter for metrics and errors + * @throws IOException if reading the file metadata fails + */ private ParquetReader(Path hadoopFilePath, SimpleGroupDeserializer simpleGroupDeserializer, ParquetFileReader parquetFileReader, SerializedStatsDReporterSupplier statsDReporterSupplier) throws IOException { this.hadoopFilePath = hadoopFilePath; @@ -60,6 +123,11 @@ private ParquetReader(Path hadoopFilePath, SimpleGroupDeserializer simpleGroupDe daggerCounterManager.increment(ParquetReaderAspects.READER_CREATED); } + /** + * Initializes and registers the counter and histogram managers with the Parquet reader tags. + * + * @param statsDReporterSupplier supplier of the StatsD reporter the managers report through + */ private void registerTagsWithMeasurementManagers(SerializedStatsDReporterSupplier statsDReporterSupplier) { StatsDTag[] parquetReaderTags = ComponentTags.getParquetReaderTags(); this.daggerCounterManager = new DaggerCounterManager(statsDReporterSupplier); @@ -68,6 +136,13 @@ private void registerTagsWithMeasurementManagers(SerializedStatsDReporterSupplie this.daggerHistogramManager.register(parquetReaderTags); } + /** + * Checks whether a row group page is {@code null}, indicating the end of the file. 
+ * + * @param page the next row group page, or {@code null} if none remain + * @return {@code true} (logging an end-of-data message) when {@code page} is {@code null}; + * {@code false} otherwise + */ private boolean checkIfNullPage(PageReadStore page) { if (page == null) { String logMessage = String.format("No more data found in Parquet file %s", hadoopFilePath.getName()); @@ -77,6 +152,14 @@ private boolean checkIfNullPage(PageReadStore page) { return false; } + /** + * Advances the reader onto a new row group, resetting the per-group cursor. + * + *
<p>
Updates the row count, resets the current record index to zero, and builds a fresh record + * reader for the supplied row group using the file schema. + * + * @param pages the row group to start reading from + */ private void changeReaderPosition(PageReadStore pages) { rowCount = pages.getRowCount(); currentRecordIndex = 0; @@ -84,6 +167,11 @@ private void changeReaderPosition(PageReadStore pages) { recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema)); } + /** + * Reads the first row group and prepares the reader for record consumption. + * + * @throws IOException if reading the first row group fails + */ private void initializeRecordReader() throws IOException { PageReadStore nextPage = parquetFileReader.readNextRowGroup(); changeReaderPosition(nextPage); @@ -92,6 +180,16 @@ private void initializeRecordReader() throws IOException { LOGGER.info(logMessage); } + /** + * Reads and deserializes the next record, advancing to the next row group when needed. + * + *
<p>
When the current row group is exhausted the next one is loaded; if none remain, + * {@code null} is returned to signal end of file. Read and deserialization durations are + * recorded to the histogram manager. + * + * @return the next record as a {@code Row}, or {@code null} when the file is fully read + * @throws IOException if reading the next row group fails + */ private Row readRecords() throws IOException { long startReadTime = Instant.now().toEpochMilli(); @@ -118,6 +216,14 @@ private Row readRecords() throws IOException { return row; } + /** + * Deserializes a single Parquet group into a Flink {@code Row}. + * + * @param simpleGroup the Parquet record to convert + * @return the deserialized {@code Row} + * @throws DaggerDeserializationException if deserialization fails; the error is reported to + * StatsD before being rethrown + */ private Row deserialize(SimpleGroup simpleGroup) { try { return simpleGroupDeserializer.deserialize(simpleGroup); @@ -127,6 +233,15 @@ private Row deserialize(SimpleGroup simpleGroup) { } } + /** + * {@inheritDoc} + * + *
<p>
Lazily initializes the record reader on the first call, reads the next record, and + * increments the rows-emitted counter. + * + * @return the next {@code Row}, or {@code null} when the file is fully read + * @throws IOException if reading fails + */ @Nullable @Override public Row read() throws IOException { @@ -138,6 +253,14 @@ public Row read() throws IOException { return row; } + /** + * {@inheritDoc} + * + *
<p>
Closes the underlying Parquet file reader, de-references the record reader, and increments + * the reader-closed counter. + * + * @throws IOException if closing the underlying file reader fails + */ @Override public void close() throws IOException { parquetFileReader.close(); @@ -147,6 +270,9 @@ public void close() throws IOException { daggerCounterManager.increment(ParquetReaderAspects.READER_CLOSED); } + /** + * Marks the record reader uninitialized and releases the reference to it. + */ private void closeRecordReader() { if (isRecordReaderInitialized) { this.isRecordReaderInitialized = false; @@ -154,20 +280,58 @@ private void closeRecordReader() { recordReader = null; } + /** + * {@inheritDoc} + * + *
<p>
Parquet files expose no usable byte offset, so the position reports + * {@code CheckpointedPosition.NO_OFFSET} together with the count of records emitted so far. + * + * @return the checkpointed position with no offset and the emitted-record count + */ @Override public CheckpointedPosition getCheckpointedPosition() { return new CheckpointedPosition(CheckpointedPosition.NO_OFFSET, totalEmittedRowCount); } + /** + * {@link ReaderProvider} that opens a {@link ParquetReader} for a given file path. + * + *
<p>
Serializable so it can be embedded in the {@link ParquetFileRecordFormat} and shipped with + * the Flink job graph; the actual {@code ParquetFileReader} is opened lazily on the task manager + * when {@link #getReader(String)} is invoked. + */ public static class ParquetReaderProvider implements ReaderProvider { + /** + * Deserializer handed to every {@link ParquetReader} this provider creates. + */ private final SimpleGroupDeserializer simpleGroupDeserializer; + /** + * Supplier of the StatsD reporter handed to every reader for metrics and error reporting. + */ private final SerializedStatsDReporterSupplier statsDReporterSupplier; + /** + * Creates a provider that builds readers from the given deserializer and StatsD supplier. + * + * @param simpleGroupDeserializer the deserializer converting Parquet groups to {@code Row}s + * @param statsDReporterSupplier supplier of the StatsD reporter for metrics and errors + */ public ParquetReaderProvider(SimpleGroupDeserializer simpleGroupDeserializer, SerializedStatsDReporterSupplier statsDReporterSupplier) { this.simpleGroupDeserializer = simpleGroupDeserializer; this.statsDReporterSupplier = statsDReporterSupplier; } + /** + * {@inheritDoc} + * + *
<p>
Opens the Parquet file at the given path with a Hadoop configuration and wraps it in a + * {@link ParquetReader}. Any failure is wrapped in a + * {@link ParquetFileSourceReaderInitializationException}, reported to StatsD, and rethrown. + * + * @param filePath the path of the Parquet file to open + * @return a new {@link ParquetReader} positioned at the start of the file + * @throws ParquetFileSourceReaderInitializationException if the file cannot be opened + */ @Override public ParquetReader getReader(String filePath) { try { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/reader/ReaderProvider.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/reader/ReaderProvider.java index 7a5b30478..a16323409 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/reader/ReaderProvider.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/reader/ReaderProvider.java @@ -5,7 +5,21 @@ import java.io.Serializable; +/** + * Serializable factory that creates a Flink {@code FileRecordFormat.Reader} of {@code Row} for a + * given file path. + * + *
<p>
Implementations (such as {@code ParquetReader.ParquetReaderProvider}) encapsulate how a reader + * is opened for a file. Being a {@link Serializable} {@link FunctionalInterface}, it can be passed + * lambda-style into the record format and shipped as part of the Flink job graph. + */ @FunctionalInterface public interface ReaderProvider extends Serializable { + /** + * Opens a reader over the file at the given path. + * + * @param filePath the path of the file to read + * @return a {@code FileRecordFormat.Reader} that yields {@code Row}s from the file + */ FileRecordFormat.Reader getReader(String filePath); } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/ChronologyOrderedSplitAssigner.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/ChronologyOrderedSplitAssigner.java index 593ea20b0..93a36315c 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/ChronologyOrderedSplitAssigner.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/ChronologyOrderedSplitAssigner.java @@ -24,14 +24,55 @@ import static com.google.api.client.util.Preconditions.checkArgument; +/** + * {@link FileSplitAssigner} that hands out Parquet file splits in ascending chronological order of + * the event-time {@link Instant} parsed from each split's file path. + * + *
<p>
Discovered splits are enriched into {@link InstantEnrichedSplit}s and held in a + * {@link PriorityBlockingQueue} ordered by their instant, so earlier partitions are always processed + * first. An optional {@link TimeRangePool} filters out splits whose instant falls outside the + * configured date ranges. Split discovery and assignment progress are exported as StatsD gauges, and + * a path that cannot be parsed results in a fatal, reported {@link IllegalArgumentException}. + * Instances are created through the nested {@link ChronologyOrderedSplitAssignerBuilder}. + */ public class ChronologyOrderedSplitAssigner implements FileSplitAssigner { + /** + * Priority queue of pending splits, ordered by ascending event-time instant. + */ private final PriorityBlockingQueue unassignedSplits; + /** + * Initial capacity of the priority queue (required by its constructor; not a hard limit). + */ private static final int INITIAL_DEFAULT_CAPACITY = 11; + /** + * Parser that extracts the event-time instant from each split's file path. + */ private PathParser pathParser; + /** + * Optional set of allowed time ranges; splits outside it are discarded. May be {@code null}. + */ private TimeRangePool timeRangePool; + /** + * Gauge manager exporting split discovery and assignment counts to StatsD. + */ private DaggerGaugeManager daggerGaugeManager; + /** + * Reporter used to surface fatal path-parsing errors to StatsD. + */ private final StatsDErrorReporter statsDErrorReporter; + /** + * Creates an assigner over the discovered splits; use + * {@link ChronologyOrderedSplitAssignerBuilder}. + * + *
<p>
Initializes the priority queue with the instant-based comparator and the gauge manager, then + * validates and enqueues the supplied splits. + * + * @param fileSourceSplits the splits discovered by Flink for this source + * @param pathParser the parser extracting an instant from each split's path + * @param timeRangePool optional allowed time ranges, or {@code null} to accept all + * @param statsDReporterSupplier supplier of the StatsD reporter for metrics and errors + */ private ChronologyOrderedSplitAssigner(Collection fileSourceSplits, PathParser pathParser, TimeRangePool timeRangePool, SerializedStatsDReporterSupplier statsDReporterSupplier) { this.pathParser = pathParser; @@ -42,6 +83,14 @@ private ChronologyOrderedSplitAssigner(Collection fileSourceSpl initAndValidate(fileSourceSplits); } + /** + * Registers split-assigner gauges and enqueues every discovered split after validation. + * + *
<p>
Records the total discovered, total recorded (after filtering), and awaiting-assignment + * counts as StatsD gauges. + * + * @param fileSourceSplits the splits discovered by Flink for this source + */ private void initAndValidate(Collection fileSourceSplits) { StatsDTag[] splitAssignerTags = ComponentTags.getSplitAssignerTags(); daggerGaugeManager.register(splitAssignerTags); @@ -53,6 +102,15 @@ private void initAndValidate(Collection fileSourceSplits) { daggerGaugeManager.markValue(ChronologyOrderedSplitAssignerAspects.SPLITS_AWAITING_ASSIGNMENT, unassignedSplits.size()); } + /** + * {@inheritDoc} + * + *
<p>
Polls the earliest (lowest-instant) split from the queue, updating the awaiting-assignment + * gauge. The {@code hostname} hint is ignored because ordering is purely chronological. + * + * @param hostname the host requesting a split, or {@code null}; ignored by this assigner + * @return the next split in chronological order, or {@link Optional#empty()} if none remain + */ @Override public Optional getNext(@Nullable String hostname) { InstantEnrichedSplit instantEnrichedSplit = unassignedSplits.poll(); @@ -63,6 +121,14 @@ public Optional getNext(@Nullable String hostname) { return Optional.of(instantEnrichedSplit.getFileSourceSplit()); } + /** + * {@inheritDoc} + * + *
<p>
Validates and re-enqueues the given splits (for example, splits returned by a failed + * reader), preserving the chronological ordering of the queue. + * + * @param splits the splits to add for (re)assignment + */ @Override public void addSplits(Collection splits) { for (FileSourceSplit split : splits) { @@ -70,6 +136,14 @@ public void addSplits(Collection splits) { } } + /** + * {@inheritDoc} + * + *
<p>
Returns the underlying {@link FileSourceSplit}s still awaiting assignment, unwrapped from + * their {@link InstantEnrichedSplit} holders. The returned order is not guaranteed to be sorted. + * + * @return the file source splits not yet assigned + */ @Override public Collection remainingSplits() { return unassignedSplits @@ -78,6 +152,15 @@ public Collection remainingSplits() { .collect(Collectors.toList()); } + /** + * Parses a split's instant and enqueues it when it falls within the configured time ranges. + * + *
<p>
Splits whose instant is outside the {@link TimeRangePool} are silently dropped. A path that + * cannot be parsed results in a fatal, reported {@link IllegalArgumentException}. + * + * @param split the split to validate and conditionally enqueue + * @throws IllegalArgumentException if the split's path cannot be parsed into an instant + */ private void validateAndAddSplits(FileSourceSplit split) { try { Instant instant = pathParser.instantFromFilePath(split.path()); @@ -91,6 +174,11 @@ private void validateAndAddSplits(FileSourceSplit split) { } } + /** + * Builds the comparator that orders splits by ascending event-time instant. + * + * @return a comparator placing earlier instants before later ones + */ private Comparator getFileSourceSplitComparator() { return (instantEnrichedSplit1, instantEnrichedSplit2) -> { Instant instant1 = instantEnrichedSplit1.getInstant(); @@ -105,26 +193,72 @@ private Comparator getFileSourceSplitComparator() { }; } + /** + * Serializable builder for {@link ChronologyOrderedSplitAssigner}. + * + *
<p>
The split collection is supplied later by Flink (via {@link #build(Collection)} used as a + * {@code FileSplitAssigner.Provider}), so only the path parser, optional date range, and StatsD + * supplier are configured here. + */ public static class ChronologyOrderedSplitAssignerBuilder implements Serializable { + /** + * Parser used to extract an instant from each split's file path; required. + */ private PathParser pathParser; + /** + * Optional allowed time ranges used to filter splits; may remain {@code null}. + */ private TimeRangePool parquetFileDateRange; + /** + * Supplier of the StatsD reporter for metrics and error reporting; required. + */ private SerializedStatsDReporterSupplier statsDReporterSupplier; + /** + * Sets the path parser used to derive each split's event-time instant. + * + * @param parser the path parser to use + * @return this builder + */ public ChronologyOrderedSplitAssignerBuilder addPathParser(PathParser parser) { this.pathParser = parser; return this; } + /** + * Sets the optional time ranges used to filter splits by their instant. + * + * @param timeRangePool the allowed time ranges, or {@code null} to accept all splits + * @return this builder + */ public ChronologyOrderedSplitAssignerBuilder addTimeRanges(TimeRangePool timeRangePool) { this.parquetFileDateRange = timeRangePool; return this; } + /** + * Sets the supplier of the StatsD reporter for metrics and error reporting. + * + * @param supplier the StatsD reporter supplier + * @return this builder + */ public ChronologyOrderedSplitAssignerBuilder addStatsDReporterSupplier(SerializedStatsDReporterSupplier supplier) { this.statsDReporterSupplier = supplier; return this; } + /** + * Builds the assigner for the splits Flink has discovered. + * + *
<p>
This method matches the {@code FileSplitAssigner.Provider} functional shape and is what + * {@code ParquetDaggerSource} passes to the file source. It requires a StatsD supplier and a + * path parser; a missing dependency is reported to StatsD and thrown. + * + * @param fileSourceSplits the splits discovered by Flink for this source + * @return a configured {@link ChronologyOrderedSplitAssigner} + * @throws IllegalArgumentException if no StatsD reporter supplier was configured + * @throws PathParserNotProvidedException if no path parser was configured + */ public ChronologyOrderedSplitAssigner build(Collection fileSourceSplits) { checkArgument(statsDReporterSupplier != null, "SerializedStatsDReporterSupplier is required but is set as null"); if (pathParser == null) { diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/IndexOrderedSplitAssigner.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/IndexOrderedSplitAssigner.java index e7db5af2c..a664b1444 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/IndexOrderedSplitAssigner.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/IndexOrderedSplitAssigner.java @@ -8,21 +8,57 @@ import java.util.Optional; /* TODO */ +/** + * Placeholder {@link FileSplitAssigner} intended to assign Parquet file splits in their index + * (discovery) order. + * + *
<p>
This implementation is not yet functional: the constructor ignores the provided splits and the + * accessor methods return empty or {@code null} results. It corresponds to the + * {@code EARLIEST_INDEX_FIRST} read-order strategy, which {@code ParquetDaggerSource} currently + * rejects as unsupported. It is retained as a stub for a future index-ordered assignment strategy. + */ public class IndexOrderedSplitAssigner implements FileSplitAssigner { + /** + * Creates a placeholder assigner; the supplied splits are currently ignored. + * + * @param fileSourceSplits the initial set of file splits (not yet used) + */ public IndexOrderedSplitAssigner(Collection fileSourceSplits) { } + /** + * {@inheritDoc} + * + *
<p>
Not yet implemented; always returns {@link Optional#empty()}. + * + * @param hostname the host requesting a split, or {@code null} if not host-local + * @return {@link Optional#empty()} always + */ @Override public Optional getNext(@Nullable String hostname) { return Optional.empty(); } + /** + * {@inheritDoc} + * + *
<p>
Not yet implemented; the supplied splits are ignored. + * + * @param splits the splits to add back for (re)assignment + */ @Override public void addSplits(Collection splits) { } + /** + * {@inheritDoc} + * + *
<p>
Not yet implemented; always returns {@code null}. + * + * @return {@code null} always + */ @Override public Collection remainingSplits() { return null; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/InstantEnrichedSplit.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/InstantEnrichedSplit.java index 7bbfb125f..99378d1b6 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/InstantEnrichedSplit.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/source/parquet/splitassigner/InstantEnrichedSplit.java @@ -6,12 +6,32 @@ import java.io.Serializable; import java.time.Instant; +/** + * A value object pairing a Flink {@link FileSourceSplit} with the {@link Instant} parsed from its + * file path. + * + *
<p>
Used by {@link ChronologyOrderedSplitAssigner} to keep file splits in a priority queue ordered + * by their event time so that Parquet files are processed chronologically. It is + * {@link Serializable} so it can participate in the Flink job graph. + */ public class InstantEnrichedSplit implements Serializable { + /** + * The wrapped Flink file source split. + */ @Getter private final FileSourceSplit fileSourceSplit; + /** + * The event-time instant derived from the split's file path, used for ordering. + */ @Getter private final Instant instant; + /** + * Creates a split enriched with its parsed event-time instant. + * + * @param fileSourceSplit the Flink file source split being wrapped + * @param instant the instant parsed from the split's file path + */ public InstantEnrichedSplit(FileSourceSplit fileSourceSplit, Instant instant) { this.fileSourceSplit = fileSourceSplit; this.instant = instant; diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/Constants.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/Constants.java index 2e658730d..e99f94c97 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/Constants.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/Constants.java @@ -1,160 +1,307 @@ package com.gotocompany.dagger.core.utils; +/** + * Centralized constant definitions for the Dagger core module. + * + *
<p>
Holds the configuration keys (and their default values) that drive Dagger's pre/post + * processors, Longbow BigTable lookups, Flink runtime tuning, Kafka/InfluxDB/BigQuery sinks, + * Kafka source consumers, telemetry, and the internal identifiers used while building the job + * pipeline. This is a constant holder and is not intended to be instantiated. + */ public class Constants { + /** Configuration key toggling the pre-processor stage. */ public static final String PROCESSOR_PREPROCESSOR_ENABLE_KEY = "PROCESSOR_PREPROCESSOR_ENABLE"; + /** Default for {@link #PROCESSOR_PREPROCESSOR_ENABLE_KEY}: pre-processing disabled. */ public static final boolean PROCESSOR_PREPROCESSOR_ENABLE_DEFAULT = false; + /** Configuration key holding the JSON definition of the pre-processor chain. */ public static final String PROCESSOR_PREPROCESSOR_CONFIG_KEY = "PROCESSOR_PREPROCESSOR_CONFIG"; + /** Configuration key toggling the post-processor stage. */ public static final String PROCESSOR_POSTPROCESSOR_ENABLE_KEY = "PROCESSOR_POSTPROCESSOR_ENABLE"; + /** Default for {@link #PROCESSOR_POSTPROCESSOR_ENABLE_KEY}: post-processing disabled. */ public static final boolean PROCESSOR_POSTPROCESSOR_ENABLE_DEFAULT = false; + /** Configuration key holding the JSON definition of the post-processor chain. */ public static final String PROCESSOR_POSTPROCESSOR_CONFIG_KEY = "PROCESSOR_POSTPROCESSOR_CONFIG"; + /** Column/configuration key naming the Longbow lookup duration. */ public static final String LONGBOW_DURATION_KEY = "longbow_duration"; + /** Column key marking the latest bound of a Longbow range lookup. */ public static final String LONGBOW_LATEST_KEY = "longbow_latest"; + /** Column key marking the earliest bound of a Longbow range lookup. */ public static final String LONGBOW_EARLIEST_KEY = "longbow_earliest"; + /** Configuration key for how long Longbow documents are retained. 
*/ public static final String PROCESSOR_LONGBOW_DOCUMENT_DURATION_KEY = "PROCESSOR_LONGBOW_DOCUMENT_DURATION"; + /** Default for {@link #PROCESSOR_LONGBOW_DOCUMENT_DURATION_KEY}: {@code "90d"}. */ public static final String PROCESSOR_LONGBOW_DOCUMENT_DURATION_DEFAULT = "90d"; + /** Delimiter used when composing Longbow row keys. */ public static final String LONGBOW_DELIMITER = "#"; + /** Column key prefix identifying Longbow data columns. */ public static final String LONGBOW_DATA_KEY = "longbow_data"; + /** Column key under which the serialized protobuf payload is stored in Longbow. */ public static final String LONGBOW_PROTO_DATA_KEY = "proto_data"; + /** Configuration key for the GCP project hosting the Longbow BigTable instance. */ public static final String PROCESSOR_LONGBOW_GCP_PROJECT_ID_KEY = "PROCESSOR_LONGBOW_GCP_PROJECT_ID"; + /** Default for {@link #PROCESSOR_LONGBOW_GCP_PROJECT_ID_KEY}: {@code "default-gcp-project"}. */ public static final String PROCESSOR_LONGBOW_GCP_PROJECT_ID_DEFAULT = "default-gcp-project"; + /** Configuration key for the BigTable instance backing Longbow. */ public static final String PROCESSOR_LONGBOW_GCP_INSTANCE_ID_KEY = "PROCESSOR_LONGBOW_GCP_INSTANCE_ID"; + /** Default for {@link #PROCESSOR_LONGBOW_GCP_INSTANCE_ID_KEY}: {@code "default-gcp-project"}. */ public static final String PROCESSOR_LONGBOW_GCP_INSTANCE_ID_DEFAULT = "default-gcp-project"; + /** Configuration key for the BigTable table backing Longbow. */ public static final String PROCESSOR_LONGBOW_GCP_TABLE_ID_KEY = "PROCESSOR_LONGBOW_GCP_TABLE_ID"; + /** Default BigTable column family used by Longbow: {@code "ts"}. */ public static final String LONGBOW_COLUMN_FAMILY_DEFAULT = "ts"; + /** Default BigTable column qualifier used by Longbow: {@code "proto"}. */ public static final String LONGBOW_QUALIFIER_DEFAULT = "proto"; + /** Default Longbow asynchronous lookup timeout: {@code 15000} ms. 
*/ public static final Long PROCESSOR_LONGBOW_ASYNC_TIMEOUT_DEFAULT = 15000L; + /** Configuration key for the Longbow asynchronous lookup timeout, in milliseconds. */ public static final String PROCESSOR_LONGBOW_ASYNC_TIMEOUT_KEY = "PROCESSOR_LONGBOW_ASYNC_TIMEOUT"; + /** Default capacity of the Longbow asynchronous lookup thread pool: {@code 30}. */ public static final Integer PROCESSOR_LONGBOW_THREAD_CAPACITY_DEFAULT = 30; + /** Configuration key for the Longbow asynchronous lookup thread-pool capacity. */ public static final String PROCESSOR_LONGBOW_THREAD_CAPACITY_KEY = "PROCESSOR_LONGBOW_THREAD_CAPACITY"; + /** Configuration key carrying the Dagger/Flink job name. */ public static final String DAGGER_NAME_KEY = "FLINK_JOB_ID"; + /** Default for {@link #DAGGER_NAME_KEY}: {@code "SQL Flink Job"}. */ public static final String DAGGER_NAME_DEFAULT = "SQL Flink Job"; + /** Reserved field name carrying a record's event timestamp. */ public static final String EVENT_TIMESTAMP = "event_timestamp"; + /** Suffix denoting a duration expressed in minutes. */ public static final String MINUTE_UNIT = "m"; + /** Suffix denoting a duration expressed in hours. */ public static final String HOUR_UNIT = "h"; + /** Suffix denoting a duration expressed in days. */ public static final String DAY_UNIT = "d"; + /** Configuration key holding the Flink SQL query to execute. */ public static final String FLINK_SQL_QUERY_KEY = "FLINK_SQL_QUERY"; + /** Default for {@link #FLINK_SQL_QUERY_KEY}: an empty query. */ public static final String FLINK_SQL_QUERY_DEFAULT = ""; + /** Default Flink job parallelism: {@code 1}. */ public static final int FLINK_PARALLELISM_DEFAULT = 1; + /** Configuration key for the Flink job parallelism. */ public static final String FLINK_PARALLELISM_KEY = "FLINK_PARALLELISM"; + /** Default maximum Flink parallelism (used when rescaling): {@code 50}. 
*/ public static final int FLINK_PARALLELISM_MAX_DEFAULT = 50; + /** Configuration key for the maximum Flink parallelism used when rescaling. */ public static final String FLINK_PARALLELISM_MAX_KEY = "FLINK_PARALLELISM_MAX"; + /** Default auto-watermark emission interval: {@code 10000} ms. */ public static final int FLINK_WATERMARK_INTERVAL_MS_DEFAULT = 10000; + /** Configuration key for the auto-watermark emission interval, in milliseconds. */ public static final String FLINK_WATERMARK_INTERVAL_MS_KEY = "FLINK_WATERMARK_INTERVAL_MS"; + /** Default checkpointing interval: {@code 30000} ms. */ public static final long FLINK_CHECKPOINT_INTERVAL_MS_DEFAULT = 30000; + /** Configuration key for the checkpointing interval, in milliseconds. */ public static final String FLINK_CHECKPOINT_INTERVAL_MS_KEY = "FLINK_CHECKPOINT_INTERVAL_MS"; + /** Default checkpoint completion timeout: {@code 900000} ms (15 minutes). */ public static final long FLINK_CHECKPOINT_TIMEOUT_MS_DEFAULT = 900000; + /** Configuration key for the checkpoint completion timeout, in milliseconds. */ public static final String FLINK_CHECKPOINT_TIMEOUT_MS_KEY = "FLINK_CHECKPOINT_TIMEOUT_MS"; + /** Default minimum pause between checkpoints: {@code 5000} ms. */ public static final long FLINK_CHECKPOINT_MIN_PAUSE_MS_DEFAULT = 5000; + /** Configuration key for the minimum pause between checkpoints, in milliseconds. */ public static final String FLINK_CHECKPOINT_MIN_PAUSE_MS_KEY = "FLINK_CHECKPOINT_MIN_PAUSE_MS"; + /** Default number of concurrent checkpoints: {@code 1}. */ public static final int FLINK_CHECKPOINT_MAX_CONCURRENT_DEFAULT = 1; + /** Configuration key for the maximum number of concurrent checkpoints. */ public static final String FLINK_CHECKPOINT_MAX_CONCURRENT_KEY = "FLINK_CHECKPOINT_MAX_CONCURRENT"; + /** Default idle-state retention: {@code 10} minutes. */ public static final int FLINK_RETENTION_IDLE_STATE_MINUTE_DEFAULT = 10; + /** Configuration key for the idle-state retention time, in minutes. 
*/ public static final String FLINK_RETENTION_IDLE_STATE_MINUTE_KEY = "FLINK_RETENTION_IDLE_STATE_MINUTE"; + /** Default watermark delay (allowed lateness): {@code 10000} ms. */ public static final long FLINK_WATERMARK_DELAY_MS_DEFAULT = 10000; + /** Configuration key for the watermark delay (allowed lateness), in milliseconds. */ public static final String FLINK_WATERMARK_DELAY_MS_KEY = "FLINK_WATERMARK_DELAY_MS"; + /** Default for {@link #FLINK_ROWTIME_ATTRIBUTE_NAME_KEY}: no rowtime attribute configured. */ public static final String FLINK_ROWTIME_ATTRIBUTE_NAME_DEFAULT = ""; + /** Configuration key naming the rowtime (event-time) attribute column. */ public static final String FLINK_ROWTIME_ATTRIBUTE_NAME_KEY = "FLINK_ROWTIME_ATTRIBUTE_NAME"; + /** Default for {@link #FLINK_WATERMARK_PER_PARTITION_ENABLE_KEY}: per-partition watermarks disabled. */ public static final boolean FLINK_WATERMARK_PER_PARTITION_ENABLE_DEFAULT = false; + /** Configuration key toggling per-partition watermark generation. */ public static final String FLINK_WATERMARK_PER_PARTITION_ENABLE_KEY = "FLINK_WATERMARK_PER_PARTITION_ENABLE"; + /** Default for {@link #FLINK_JOB_ID_KEY}: {@code "SQL Flink job"}. */ public static final String FLINK_JOB_ID_DEFAULT = "SQL Flink job"; + /** Configuration key carrying the Flink job identifier. */ public static final String FLINK_JOB_ID_KEY = "FLINK_JOB_ID"; + /** Synchronizer configuration key for the target BigTable table id. */ public static final String SYNCHRONIZER_BIGTABLE_TABLE_ID_KEY = "bigtable_table_id"; + /** Synchronizer configuration key for the input proto class name. */ public static final String SYNCHRONIZER_INPUT_CLASSNAME_KEY = "input_class_name"; + /** Synchronizer configuration key identifying the Longbow read key. */ public static final String SYNCHRONIZER_LONGBOW_READ_KEY = "longbow_read_key"; + /** Configuration key for the Kafka sink topic. 
*/ public static final String SINK_KAFKA_TOPIC_KEY = "SINK_KAFKA_TOPIC"; + /** Configuration key for the Kafka sink bootstrap brokers. */ public static final String SINK_KAFKA_BROKERS_KEY = "SINK_KAFKA_BROKERS"; + /** Configuration key for the protobuf class used for the Kafka sink message key. */ public static final String SINK_KAFKA_PROTO_KEY = "SINK_KAFKA_PROTO_KEY"; + /** Configuration key for the protobuf class used for the Kafka sink message value. */ public static final String SINK_KAFKA_PROTO_MESSAGE_KEY = "SINK_KAFKA_PROTO_MESSAGE"; + /** Configuration key naming the Kafka sink stream. */ public static final String SINK_KAFKA_STREAM_KEY = "SINK_KAFKA_STREAM"; + /** Configuration key holding the JSON schema for the Kafka sink. */ public static final String SINK_KAFKA_JSON_SCHEMA_KEY = "SINK_KAFKA_JSON_SCHEMA"; + /** Configuration key selecting the Kafka sink data type, such as PROTO or JSON. */ public static final String SINK_KAFKA_DATA_TYPE = "SINK_KAFKA_DATA_TYPE"; + /** Configuration key toggling large-message production tuning for the Kafka sink. */ public static final String SINK_KAFKA_PRODUCE_LARGE_MESSAGE_ENABLE_KEY = "SINK_KAFKA_PRODUCE_LARGE_MESSAGE_ENABLE"; + /** Configuration key for the Kafka producer {@code linger.ms} on the sink. */ public static final String SINK_KAFKA_LINGER_MS_KEY = "SINK_KAFKA_LINGER_MS"; + /** Default for {@link #SINK_KAFKA_PRODUCE_LARGE_MESSAGE_ENABLE_KEY}: large-message tuning disabled. */ public static final boolean SINK_KAFKA_PRODUCE_LARGE_MESSAGE_ENABLE_DEFAULT = false; + /** Kafka producer property name for the sink compression type. */ public static final String SINK_KAFKA_COMPRESSION_TYPE_KEY = "compression.type"; + /** Kafka producer property name for the sink linger time. */ public static final String SINK_KAFKA_LINGER_MS_CONFIG_KEY = "linger.ms"; + /** Default Kafka sink compression type: {@code "snappy"}. 
*/ public static final String SINK_KAFKA_COMPRESSION_TYPE_DEFAULT = "snappy"; + /** Kafka producer property name for the sink maximum request size. */ public static final String SINK_KAFKA_MAX_REQUEST_SIZE_KEY = "max.request.size"; + /** Default Kafka sink maximum request size: {@code 20971520} bytes (20 MB). */ public static final String SINK_KAFKA_MAX_REQUEST_SIZE_DEFAULT = "20971520"; + /** Default Kafka sink linger time: {@code "0"} ms. */ public static final String SINK_KAFKA_LINGER_MS_DEFAULT = "0"; + /** External post-processor type identifier for Elasticsearch. */ public static final String ES_TYPE = "ES"; + /** External post-processor type identifier for HTTP. */ public static final String HTTP_TYPE = "HTTP"; + /** External post-processor type identifier for PostgreSQL. */ public static final String PG_TYPE = "PG"; + /** External post-processor type identifier for gRPC. */ public static final String GRPC_TYPE = "GRPC"; + /** Sentinel selecting all columns in an external post-processor SQL path. */ public static final String SQL_PATH_SELECT_ALL_CONFIG_VALUE = "*"; + /** Internal key identifying the Longbow writer post-processor. */ public static final String LONGBOW_WRITER_PROCESSOR_KEY = "longbow_writer_processor"; + /** Internal key identifying the Longbow reader post-processor. */ public static final String LONGBOW_READER_PROCESSOR_KEY = "longbow_reader_processor"; + /** Internal key identifying the transform processor. */ public static final String TRANSFORM_PROCESSOR_KEY = "transform_processor"; + /** Class name of the built-in SQL transformer. */ public static final String SQL_TRANSFORMER_CLASS = "SQLTransformer"; + /** Stream configuration key for the index of the event-timestamp field in the input schema. */ public static final String STREAM_INPUT_SCHEMA_EVENT_TIMESTAMP_FIELD_INDEX_KEY = "INPUT_SCHEMA_EVENT_TIMESTAMP_FIELD_INDEX"; + /** Stream configuration key listing the Kafka source topic names. 
*/ public static final String STREAM_SOURCE_KAFKA_TOPIC_NAMES_KEY = "SOURCE_KAFKA_TOPIC_NAMES"; + /** Stream configuration key naming the Kafka source stream. */ public static final String STREAM_INPUT_STREAM_NAME_KEY = "SOURCE_KAFKA_NAME"; + /** Stream configuration key holding the source-details JSON. */ public static final String STREAM_SOURCE_DETAILS_KEY = "SOURCE_DETAILS"; + /** Source-details key selecting the source type. */ public static final String STREAM_SOURCE_DETAILS_SOURCE_TYPE_KEY = "SOURCE_TYPE"; + /** Source-type value for a bounded (batch) source. */ public static final String STREAM_SOURCE_DETAILS_SOURCE_TYPE_BOUNDED = "BOUNDED"; + /** Source-type value for an unbounded (streaming) source. */ public static final String STREAM_SOURCE_DETAILS_SOURCE_TYPE_UNBOUNDED = "UNBOUNDED"; + /** Source-details key selecting the source implementation by name. */ public static final String STREAM_SOURCE_DETAILS_SOURCE_NAME_KEY = "SOURCE_NAME"; + /** Source-name value for the Flink Kafka source connector. */ public static final String STREAM_SOURCE_DETAILS_SOURCE_NAME_KAFKA = "KAFKA_SOURCE"; + /** Source-name value for the Parquet file source connector. */ public static final String STREAM_SOURCE_DETAILS_SOURCE_NAME_PARQUET = "PARQUET_SOURCE"; + /** Source-name value for the legacy Flink Kafka consumer. */ public static final String STREAM_SOURCE_DETAILS_SOURCE_NAME_KAFKA_CONSUMER = "KAFKA_CONSUMER"; + /** Stream configuration key listing the Parquet source file paths. */ public static final String STREAM_SOURCE_PARQUET_FILE_PATHS_KEY = "SOURCE_PARQUET_FILE_PATHS"; + /** Stream configuration key selecting the Parquet read-order strategy. */ public static final String STREAM_SOURCE_PARQUET_READ_ORDER_STRATEGY_KEY = "SOURCE_PARQUET_READ_ORDER_STRATEGY"; + /** Parquet read-order value processing earliest-timestamp file URLs first. 
*/ public static final String STREAM_SOURCE_PARQUET_READ_ORDER_STRATEGY_EARLIEST_TIME_URL_FIRST = "EARLIEST_TIME_URL_FIRST"; + /** Parquet read-order value processing the earliest file index first. */ public static final String STREAM_SOURCE_PARQUET_READ_ORDER_STRATEGY_EARLIEST_INDEX_FIRST = "EARLIEST_INDEX_FIRST"; + /** Stream configuration key selecting the Parquet schema-match strategy. */ public static final String STREAM_SOURCE_PARQUET_SCHEMA_MATCH_STRATEGY_KEY = "SOURCE_PARQUET_SCHEMA_MATCH_STRATEGY"; + /** Stream configuration key restricting Parquet source files to a date range. */ public static final String STREAM_SOURCE_PARQUET_FILE_DATE_RANGE_KEY = "SOURCE_PARQUET_FILE_DATE_RANGE"; + /** Parquet schema-match value requiring an identical schema, failing on mismatch. */ public static final String STREAM_SOURCE_PARQUET_SAME_SCHEMA_MATCH_STRATEGY = "SAME_SCHEMA_WITH_FAIL_ON_MISMATCH"; + /** Parquet schema-match value allowing backward-compatible schemas, failing on type mismatch. */ public static final String STREAM_SOURCE_PARQUET_BACKWARD_COMPATIBLE_SCHEMA_MATCH_STRATEGY = "BACKWARD_COMPATIBLE_SCHEMA_WITH_FAIL_ON_TYPE_MISMATCH"; + /** Stream configuration key selecting the input data type, such as PROTO or JSON. */ public static final String STREAM_INPUT_DATATYPE = "INPUT_DATATYPE"; + /** Stream configuration key naming the event-timestamp field for JSON input. */ public static final String STREAM_INPUT_SCHEMA_JSON_EVENT_TIMESTAMP_FIELD_NAME_KEY = "INPUT_SCHEMA_JSON_EVENT_TIMESTAMP_FIELD_NAME"; + /** Stream configuration key holding the JSON schema for JSON input. */ public static final String STREAM_INPUT_SCHEMA_JSON_SCHEMA_KEY = "INPUT_SCHEMA_JSON_SCHEMA"; + /** Configuration key for the Kafka consumer {@code auto.offset.reset} policy. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_AUTO_OFFSET_RESET_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_AUTO_OFFSET_RESET"; + /** Default Kafka consumer offset-reset policy: {@code "latest"}. 
*/ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_AUTO_OFFSET_RESET_DEFAULT = "latest"; + /** Configuration key for the Kafka consumer {@code enable.auto.commit} flag. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_AUTO_COMMIT_ENABLE_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_AUTO_COMMIT_ENABLE"; + /** Configuration key for the Kafka consumer group id. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_GROUP_ID_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_GROUP_ID"; + /** Configuration key for the Kafka consumer bootstrap servers. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_BOOTSTRAP_SERVERS_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_BOOTSTRAP_SERVERS"; + /** Configuration key for the Kafka consumer security protocol. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SECURITY_PROTOCOL_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SECURITY_PROTOCOL"; + /** Configuration key for the Kafka consumer SASL mechanism. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SASL_MECHANISM_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SASL_MECHANISM"; + /** Configuration key for the Kafka consumer SASL JAAS configuration. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SASL_JAAS_CONFIG_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SASL_JAAS_CONFIG"; + /** Configuration key prefix for additional raw Kafka consumer properties. */ public static final String SOURCE_KAFKA_CONSUMER_ADDITIONAL_CONFIGURATIONS = "SOURCE_KAFKA_CONSUMER_ADDITIONAL_CONFIGURATIONS"; + /** Configuration key for the Kafka consumer SSL key password. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEY_PASSWORD_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEY_PASSWORD"; + /** Configuration key for the Kafka consumer SSL keystore location. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_LOCATION_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_LOCATION"; + /** Configuration key for the Kafka consumer SSL keystore password. 
*/ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_PASSWORD_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_PASSWORD"; + /** Configuration key for the Kafka consumer SSL keystore type. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_TYPE_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SSL_KEYSTORE_TYPE"; + /** Configuration key for the Kafka consumer SSL truststore location. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_LOCATION_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_LOCATION"; + /** Configuration key for the Kafka consumer SSL truststore password. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_PASSWORD_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_PASSWORD"; + /** Configuration key for the Kafka consumer SSL truststore type. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_TYPE_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SSL_TRUSTSTORE_TYPE"; + /** Configuration key for the Kafka consumer SSL protocol. */ public static final String SOURCE_KAFKA_CONSUMER_CONFIG_SSL_PROTOCOL_KEY = "SOURCE_KAFKA_CONSUMER_CONFIG_SSL_PROTOCOL"; + /** Configuration key toggling metric telemetry reporting. */ public static final String METRIC_TELEMETRY_ENABLE_KEY = "METRIC_TELEMETRY_ENABLE"; + /** Default for {@link #METRIC_TELEMETRY_ENABLE_KEY}: telemetry enabled. */ public static final boolean METRIC_TELEMETRY_ENABLE_VALUE_DEFAULT = true; + /** Configuration key for the telemetry shutdown grace period, in milliseconds. */ public static final String METRIC_TELEMETRY_SHUTDOWN_PERIOD_MS_KEY = "METRIC_TELEMETRY_SHUTDOWN_PERIOD_MS"; + /** Default telemetry shutdown grace period: {@code 10000} ms. */ public static final long METRIC_TELEMETRY_SHUTDOWN_PERIOD_MS_DEFAULT = 10000; + /** Metric group key under which fatal exceptions are reported. 
*/ public static final String FATAL_EXCEPTION_METRIC_GROUP_KEY = "fatal.exception"; + /** Metric group key under which non-fatal exceptions are reported. */ public static final String NONFATAL_EXCEPTION_METRIC_GROUP_KEY = "non.fatal.exception"; + /** Configuration key listing the UDF function-factory classes to register. */ public static final String FUNCTION_FACTORY_CLASSES_KEY = "FUNCTION_FACTORY_CLASSES"; + /** Default for {@link #FUNCTION_FACTORY_CLASSES_KEY}: {@code "FunctionFactory"}. */ public static final String FUNCTION_FACTORY_CLASSES_DEFAULT = "FunctionFactory"; + /** Metric key counting records dropped by the InfluxDB sink for being late. */ public static final String SINK_INFLUX_LATE_RECORDS_DROPPED_KEY = "influx.late.records.dropped"; + /** Configuration key for the InfluxDB database name. */ public static final String SINK_INFLUX_DB_NAME_KEY = "SINK_INFLUX_DB_NAME"; + /** Default for {@link #SINK_INFLUX_DB_NAME_KEY}: empty. */ public static final String SINK_INFLUX_DB_NAME_DEFAULT = ""; + /** Configuration key for the InfluxDB retention policy. */ public static final String SINK_INFLUX_RETENTION_POLICY_KEY = "SINK_INFLUX_RETENTION_POLICY"; + /** Default for {@link #SINK_INFLUX_RETENTION_POLICY_KEY}: empty. */ public static final String SINK_INFLUX_RETENTION_POLICY_DEFAULT = ""; + /** Configuration key for the InfluxDB measurement name. */ public static final String SINK_INFLUX_MEASUREMENT_NAME_KEY = "SINK_INFLUX_MEASUREMENT_NAME"; + /** Default for {@link #SINK_INFLUX_MEASUREMENT_NAME_KEY}: empty. */ public static final String SINK_INFLUX_MEASUREMENT_NAME_DEFAULT = ""; // A custom job can use this configuration to get all Influx measurement names as a list @@ -169,42 +316,75 @@ public class Constants { // configuration without changing the code. public static final String SINK_INFLUX_RETENTION_POLICY_LIST_KEY = "SINK_INFLUX_RETENTION_POLICY_LIST"; + /** Configuration key for the InfluxDB connection URL. 
*/ public static final String SINK_INFLUX_URL_KEY = "SINK_INFLUX_URL"; + /** Default for {@link #SINK_INFLUX_URL_KEY}: empty. */ public static final String SINK_INFLUX_URL_DEFAULT = ""; + /** Configuration key for the InfluxDB username. */ public static final String SINK_INFLUX_USERNAME_KEY = "SINK_INFLUX_USERNAME"; + /** Default for {@link #SINK_INFLUX_USERNAME_KEY}: empty. */ public static final String SINK_INFLUX_USERNAME_DEFAULT = ""; + /** Configuration key for the InfluxDB password. */ public static final String SINK_INFLUX_PASSWORD_KEY = "SINK_INFLUX_PASSWORD"; + /** Default for {@link #SINK_INFLUX_PASSWORD_KEY}: empty. */ public static final String SINK_INFLUX_PASSWORD_DEFAULT = ""; + /** Configuration key for the InfluxDB write batch size. */ public static final String SINK_INFLUX_BATCH_SIZE_KEY = "SINK_INFLUX_BATCH_SIZE"; + /** Default InfluxDB batch size: {@code 0} (batching disabled). */ public static final int SINK_INFLUX_BATCH_SIZE_DEFAULT = 0; + /** Configuration key for the InfluxDB flush interval, in milliseconds. */ public static final String SINK_INFLUX_FLUSH_DURATION_MS_KEY = "SINK_INFLUX_FLUSH_DURATION_MS"; + /** Default InfluxDB flush interval: {@code 0} (flush disabled). */ public static final int SINK_INFLUX_FLUSH_DURATION_MS_DEFAULT = 0; + /** Configuration key toggling the InfluxDB writer that uses row field names. */ public static final String SINK_INFLUX_USING_ROW_FIELD_NAMES_KEY = "SINK_INFLUX_WITH_ROW_NAMES_WRITER"; + /** Default for {@link #SINK_INFLUX_USING_ROW_FIELD_NAMES_KEY}: field-name writer disabled. */ public static final boolean SINK_INFLUX_USING_ROW_FIELD_NAMES_DEFAULT = false; + /** Configuration key toggling large-message consumption tuning for the Kafka source. */ public static final String SOURCE_KAFKA_CONSUME_LARGE_MESSAGE_ENABLE_KEY = "SOURCE_KAFKA_CONSUME_LARGE_MESSAGE_ENABLE"; + /** Default for {@link #SOURCE_KAFKA_CONSUME_LARGE_MESSAGE_ENABLE_KEY}: large-message tuning disabled. 
*/ public static final boolean SOURCE_KAFKA_CONSUME_LARGE_MESSAGE_ENABLE_DEFAULT = false; + /** Kafka consumer property name for the maximum bytes fetched per partition. */ public static final String SOURCE_KAFKA_MAX_PARTITION_FETCH_BYTES_KEY = "max.partition.fetch.bytes"; + /** Default Kafka maximum partition fetch size: {@code 5242880} bytes (5 MB). */ public static final String SOURCE_KAFKA_MAX_PARTITION_FETCH_BYTES_DEFAULT = "5242880"; + /** Lower bound of the HTTP client-error status range: {@code 400}. */ public static final int CLIENT_ERROR_MIN_STATUS_CODE = 400; + /** Upper bound of the HTTP client-error status range: {@code 499}. */ public static final int CLIENT_ERROR_MAX_STATUS_CODE = 499; + /** Lower bound of the HTTP server-error status range: {@code 500}. */ public static final int SERVER_ERROR_MIN_STATUS_CODE = 500; + /** Upper bound of the HTTP server-error status range: {@code 599}. */ public static final int SERVER_ERROR_MAX_STATUS_CODE = 599; + /** Default maximum event-loop execution time for async I/O: {@code 10000} ms. */ public static final long MAX_EVENT_LOOP_EXECUTE_TIME_DEFAULT = 10000; + /** Number of extra columns Longbow appends to each output row: {@code 3}. */ public static final int LONGBOW_OUTPUT_ADDITIONAL_ARITY = 3; + /** + * Categories of variables an external post-processor can extract from a record when building + * an outbound request. + */ public enum ExternalPostProcessorVariableType { REQUEST_VARIABLES, HEADER_VARIABLES, QUERY_VARIABLES, ENDPOINT_VARIABLE }; + /** Configuration key for the BigQuery sink write batch size. */ public static final String SINK_BIGQUERY_BATCH_SIZE = "SINK_BIGQUERY_BATCH_SIZE"; + /** Default BigQuery sink batch size: {@code 500}. 
*/ public static final int SINK_BIGQUERY_BATCH_SIZE_DEFAULT = 500; // Comma seperated error types public static final String SINK_ERROR_TYPES_FOR_FAILURE = "SINK_ERROR_TYPES_FOR_FAILURE"; + /** Default for {@link #SINK_ERROR_TYPES_FOR_FAILURE}: no error types treated as failures. */ public static final String SINK_ERROR_TYPES_FOR_FAILURE_DEFAULT = ""; + /** Kafka consumer security protocols supported by Dagger. */ public static final String[] SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SECURITY_PROTOCOL = {"SASL_PLAINTEXT", "SASL_SSL", "SSL"}; + /** Kafka consumer SASL mechanisms supported by Dagger. */ public static final String[] SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SASL_MECHANISM = {"PLAIN", "SCRAM-SHA-256", "SCRAM-SHA-512"}; + /** Kafka consumer SSL protocols supported by Dagger. */ public static final String[] SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SSL_PROTOCOL = {"TLS", "TLSv1.1", "TLSv1.2", "TLSv1.3", "SSL", "SSLv2", "SSLv3"}; + /** Kafka consumer SSL keystore/truststore file types supported by Dagger. */ public static final String[] SUPPORTED_SOURCE_KAFKA_CONSUMER_CONFIG_SSL_STORE_FILE_TYPE = {"JKS", "PKCS12", "PEM"}; } diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/DescriptorsUtil.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/DescriptorsUtil.java index 8b3caa589..9c825d307 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/DescriptorsUtil.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/DescriptorsUtil.java @@ -13,6 +13,9 @@ */ public class DescriptorsUtil { + /** + * Logger used to record when a nested field descriptor cannot be resolved. 
+ */ private static final Logger LOGGER = LoggerFactory.getLogger(DescriptorsUtil.class.getName()); /** diff --git a/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/KafkaConfigUtil.java b/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/KafkaConfigUtil.java index 14539a01c..74b2d11f5 100644 --- a/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/KafkaConfigUtil.java +++ b/dagger-core/src/main/java/com/gotocompany/dagger/core/utils/KafkaConfigUtil.java @@ -6,8 +6,29 @@ import java.util.Set; import java.util.regex.Matcher; +/** + * Utility for translating Dagger's namespaced configuration entries into native Kafka client + * properties. + * + *

<p>Different Kafka connector roles (for example a source consumer or a sink producer) namespace + * their settings with a role-specific prefix described by {@link KafkaConnectorTypesMetadata}. This + * helper uses that metadata's pattern to pick the relevant entries and rewrite each key into the + * dotted property name understood by the Kafka client. + */ public class KafkaConfigUtil { + /** + * Extracts native Kafka client properties from a set of namespaced configuration entries. + * + *

<p>Each key in {@code properties} is matched against the pattern from + * {@code kafkaConnectorTypesMetadata}; for matching keys the first capture group is lowercased + * and its underscore runs are collapsed into dots to form the Kafka property name (for example + * {@code GROUP_ID} becomes {@code group.id}). Non-matching keys are ignored. + * + * @param kafkaConnectorTypesMetadata the connector metadata supplying the key-matching pattern + * @param properties the namespaced configuration entries to translate + * @return the extracted Kafka client properties with normalized dotted keys + */ public static Properties parseKafkaConfiguration(KafkaConnectorTypesMetadata kafkaConnectorTypesMetadata, Properties properties) { Properties kafkaProperties = new Properties(); Set configKeys = properties.keySet(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/common/Constants.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/common/Constants.java index 0befc0b42..c9c146297 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/common/Constants.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/common/Constants.java @@ -1,48 +1,106 @@ package com.gotocompany.dagger.functions.common; +/** + * Centralised configuration keys, default values, and structural constants used across the Dagger + * functions module. + * + *

The entries here name the environment/configuration properties (and their fallback defaults) + * that drive the DART artifact store (GCS/OSS/COS), the Python UDF runtime, and the pluggable job + * builder, alongside a couple of sizes used by the feature UDFs. Defining them in one place keeps + * key names and defaults consistent wherever they are read. + */ public class Constants { + /** Number of distinct data-type columns expected in a serialised feature row. */ public static final Integer NUMBER_OF_DATA_TYPES_IN_FEATURE_ROW = 8; + /** Number of arguments (key, value, type) that make up a single feature accumulator entry. */ public static final Integer NUMBER_OF_ARGUMENTS_IN_FEATURE_ACCUMULATOR = 3; + /** + * Configuration key for the GCS project id backing the DART store. + * + * @deprecated superseded by the storage-agnostic {@link #UDF_DART_PROJECT_ID_KEY}. + */ @Deprecated public static final String UDF_DART_GCS_PROJECT_ID_KEY = "UDF_DART_GCS_PROJECT_ID"; + /** + * Default GCS project id (empty) for the DART store. + * + * @deprecated superseded by {@link #UDF_DART_PROJECT_ID_DEFAULT}. + */ @Deprecated public static final String UDF_DART_GCS_PROJECT_ID_DEFAULT = ""; + /** + * Configuration key for the GCS bucket id backing the DART store. + * + * @deprecated superseded by the storage-agnostic {@link #UDF_DART_BUCKET_ID_KEY}. + */ @Deprecated public static final String UDF_DART_GCS_BUCKET_ID_KEY = "UDF_DART_GCS_BUCKET_ID"; + /** + * Default GCS bucket id (empty) for the DART store. + * + * @deprecated superseded by {@link #UDF_DART_BUCKET_ID_DEFAULT}. + */ @Deprecated public static final String UDF_DART_GCS_BUCKET_ID_DEFAULT = ""; + /** Configuration key for the cloud project id hosting the DART artifact store. */ public static final String UDF_DART_PROJECT_ID_KEY = "UDF_DART_PROJECT_ID"; + /** Default DART project id, empty unless explicitly overridden. 
*/ public static final String UDF_DART_PROJECT_ID_DEFAULT = ""; + /** Configuration key for the object-store bucket holding DART artifacts. */ public static final String UDF_DART_BUCKET_ID_KEY = "UDF_DART_BUCKET_ID"; + /** Default DART bucket id, empty unless explicitly overridden. */ public static final String UDF_DART_BUCKET_ID_DEFAULT = ""; + /** Configuration key selecting which object-store provider backs the DART store. */ public static final String UDF_STORE_PROVIDER_KEY = "UDF_STORE_PROVIDER"; + /** Provider value selecting Google Cloud Storage as the DART store. */ public static final String UDF_STORE_PROVIDER_GCS = "GCS"; + /** Provider value selecting Alibaba Cloud OSS as the DART store. */ public static final String UDF_STORE_PROVIDER_OSS = "OSS"; + /** Provider value selecting Tencent Cloud COS as the DART store. */ public static final String UDF_STORE_PROVIDER_COS = "COS"; + /** Configuration key holding the raw Python UDF configuration block. */ public static final String PYTHON_UDF_CONFIG = "PYTHON_UDF_CONFIG"; + /** Configuration key toggling whether Python UDFs are enabled. */ public static final String PYTHON_UDF_ENABLE_KEY = "PYTHON_UDF_ENABLE"; + /** Default for Python UDF support: disabled. */ public static final boolean PYTHON_UDF_ENABLE_DEFAULT = false; + /** Configuration key listing the Python source files ({@code .py}/{@code .zip}) to register. */ public static final String PYTHON_FILES_KEY = "PYTHON_FILES"; + /** Configuration key pointing to the Python requirements specification. */ public static final String PYTHON_REQUIREMENTS_KEY = "PYTHON_REQUIREMENTS"; + /** Configuration key listing Python archive files to distribute with the job. */ public static final String PYTHON_ARCHIVES_KEY = "PYTHON_ARCHIVES"; + /** Configuration key for the Arrow batch size used by Python function execution. 
*/ public static final String PYTHON_FN_EXECUTION_ARROW_BATCH_SIZE_KEY = "PYTHON_FN_EXECUTION_ARROW_BATCH_SIZE"; + /** Default Arrow batch size for Python function execution. */ public static final Integer PYTHON_FN_EXECUTION_ARROW_BATCH_SIZE_DEFAULT = 10000; + /** Configuration key for the bundle size (records per bundle) of Python function execution. */ public static final String PYTHON_FN_EXECUTION_BUNDLE_SIZE_KEY = "PYTHON_FN_EXECUTION_BUNDLE_SIZE"; + /** Default bundle size for Python function execution. */ public static final Integer PYTHON_FN_EXECUTION_BUNDLE_SIZE_DEFAULT = 100000; + /** Configuration key for the maximum bundle time (milliseconds) of Python function execution. */ public static final String PYTHON_FN_EXECUTION_BUNDLE_TIME_KEY = "PYTHON_FN_EXECUTION_BUNDLE_TIME"; + /** Default maximum bundle time, in milliseconds, for Python function execution. */ public static final long PYTHON_FN_EXECUTION_BUNDLE_TIME_DEFAULT = 1000; + /** Configuration key for the Alibaba Cloud OSS endpoint used by the DART store. */ public static final String OSS_ENDPOINT = "OSS_ENDPOINT"; + /** Default OSS endpoint (Jakarta region) used when none is configured. */ public static final String DEFAULT_OSS_ENDPOINT = "oss-ap-southeast-5.aliyuncs.com"; + /** Configuration key for the Tencent Cloud COS region used by the DART store. */ public static final String COS_REGION = "COS_REGION"; + /** Default COS region (Jakarta) used when none is configured. */ public static final String DEFAULT_COS_REGION = "ap-jakarta"; + /** Configuration key toggling the TKE OIDC credential provider for COS authentication. */ public static final String ENABLE_TKE_OIDC_PROVIDER = "ENABLE_TKE_OIDC_PROVIDER"; + /** Configuration key for the fully-qualified class name of the job builder to instantiate. */ public static final String JOB_BUILDER_FQCN_KEY = "JOB_BUILDER_FQCN"; + /** Default job builder implementation: the SQL-based Dagger job builder. 
*/ public static final String DEFAULT_JOB_BUILDER_FQCN = "com.gotocompany.dagger.core.DaggerSqlJobBuilder"; } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/common/CosLibClient.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/common/CosLibClient.java index c6a6bad71..c6f8279eb 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/common/CosLibClient.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/common/CosLibClient.java @@ -14,10 +14,20 @@ * Stateless class. */ public class CosLibClient { + /** + * The singleton instance of {@link CosLibClient}, exposed through the Lombok-generated + * {@code getInstance()} accessor and replaceable in tests via {@code testOnlySetInstance}. + */ @Getter private static CosLibClient instance = new CosLibClient(); + /** + * Name of the environment variable that holds the COS secret id used for authentication. + */ private static final String ENV_COS_SECRET_ID = "COS_SECRET_ID"; + /** + * Name of the environment variable that holds the COS secret key used for authentication. + */ private static final String ENV_COS_SECRET_KEY = "COS_SECRET_KEY"; // the credential provider provides short living token. If we have a libCosClient long living object with these diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/InvalidNumberOfArgumentsException.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/InvalidNumberOfArgumentsException.java index 0869d4e04..4466c55db 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/InvalidNumberOfArgumentsException.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/InvalidNumberOfArgumentsException.java @@ -4,6 +4,10 @@ * The class Exception for Invalid number of arguments. 
*/ public class InvalidNumberOfArgumentsException extends RuntimeException { + /** + * Default message used when the exception is raised without an explicit reason, indicating that the + * number of arguments passed to the UDF must be divisible by three. + */ private static final String DEFAULT_ERROR_MESSAGE = "Invalid number of arguments given to Udf. Requires arguments that is divisible by 3."; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/OddNumberOfArgumentsException.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/OddNumberOfArgumentsException.java index 5f481a84a..b93eefa84 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/OddNumberOfArgumentsException.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/OddNumberOfArgumentsException.java @@ -5,6 +5,10 @@ */ public class OddNumberOfArgumentsException extends RuntimeException { + /** + * Default message used when the exception is raised without an explicit reason, indicating that the + * UDF received an odd number of arguments when an even count was required. + */ private static final String DEFAULT_ERROR_MESSAGE = "Odd number of arguments given to Udf. Requires even."; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/TagDoesNotExistException.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/TagDoesNotExistException.java index 6bdb1eae1..a287ce6bf 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/TagDoesNotExistException.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/exceptions/TagDoesNotExistException.java @@ -13,6 +13,12 @@ public TagDoesNotExistException(String message) { super(message); } + /** + * Instantiates a new Tag does not exist exception with a root cause. 
+ * + * @param message the detail message describing which tag was missing + * @param cause the underlying cause of this exception + */ public TagDoesNotExistException(String message, Throwable cause) { super(message, cause); } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/ClearColumnTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/ClearColumnTransformer.java index 409ed1985..6d8103333 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/ClearColumnTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/ClearColumnTransformer.java @@ -17,8 +17,17 @@ * Allows to clear the specified column of data produced by the dagger. */ public class ClearColumnTransformer implements MapFunction, Transformer { + /** + * Transformation-argument key whose value names the column to be cleared. + */ private static final String TARGET_KEY_COLUMN_NAME = "targetColumnName"; + /** + * Name of the column whose value is replaced with an empty string by this transformer. + */ private final String targetColumnName; + /** + * Ordered names of the columns in the incoming {@link Row}, used to resolve the target column index. + */ private String[] columnNames; /** @@ -33,6 +42,16 @@ public ClearColumnTransformer(Map transformationArguments, Strin this.targetColumnName = transformationArguments.get(TARGET_KEY_COLUMN_NAME); } + /** + * Copies the incoming row and blanks out the configured target column. + * + *

Every field is copied from {@code inputRow} into a new {@link Row} of the same arity, and the + * field at the resolved target-column index is overwritten with an empty string. + * + * @param inputRow the row to transform + * @return a new row identical to {@code inputRow} except that the target column is set to an empty string + * @throws IllegalArgumentException if the configured target column is not present in the column names + */ @Override public Row map(Row inputRow) throws IllegalArgumentException { int targetFieldIndex = Arrays.asList(columnNames).indexOf(targetColumnName); @@ -47,6 +66,15 @@ public Row map(Row inputRow) throws IllegalArgumentException { return outputRow; } + /** + * Wires this map function into the streaming pipeline. + * + *

Applies this transformer as a {@link MapFunction} over the input data stream and returns a new + * {@link StreamInfo} that preserves the original column names. + * + * @param inputStreamInfo the incoming stream and its column metadata + * @return a {@link StreamInfo} wrapping the mapped data stream with the original column names + */ @Override public StreamInfo transform(StreamInfo inputStreamInfo) { DataStream inputStream = inputStreamInfo.getDataStream(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/DeDuplicationTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/DeDuplicationTransformer.java index 1980fde0c..7de8d1cb6 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/DeDuplicationTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/DeDuplicationTransformer.java @@ -21,9 +21,21 @@ * Allows to deduplicate data produced by the dagger. */ public class DeDuplicationTransformer extends RichFilterFunction implements Transformer { + /** + * Name of the keyed Flink state used to remember keys that have already been emitted. + */ private static final String DE_DUP_STATE = "DE_DUP_STATE"; + /** + * Index, within the configured column names, of the column whose value identifies a record. + */ private final int keyIndex; + /** + * Time-to-live in seconds after which a remembered key expires and a matching record is allowed again. + */ private final Integer ttlInSeconds; + /** + * Keyed state mapping a previously seen key to a marker, used to detect and drop duplicate records. + */ private MapState mapState; /** @@ -38,6 +50,15 @@ public DeDuplicationTransformer(Map transformationArguments, Str ttlInSeconds = Integer.valueOf(String.valueOf(transformationArguments.get("ttl_in_seconds"))); } + /** + * Wires this de-duplication filter into the streaming pipeline. + * + *

Keys the input stream by the configured key column and applies this {@link RichFilterFunction} + * so that only the first record seen for each key (within the configured TTL) is forwarded. + * + * @param inputStreamInfo the incoming stream and its column metadata + * @return a {@link StreamInfo} wrapping the de-duplicated data stream with the original column names + */ @Override public StreamInfo transform(StreamInfo inputStreamInfo) { DataStream inputStream = inputStreamInfo.getDataStream(); @@ -47,6 +68,16 @@ public StreamInfo transform(StreamInfo inputStreamInfo) { return new StreamInfo(outputStream, inputStreamInfo.getColumnNames()); } + /** + * Initialises the keyed de-duplication state when the operator starts. + * + *

Builds a {@link MapStateDescriptor} for the de-duplication state, configures a time-to-live so + * that remembered keys expire after the configured number of seconds, and obtains the backing + * {@link MapState} from the runtime context. + * + * @param internalFlinkConfig the Flink configuration supplied by the runtime + * @throws Exception if the superclass initialisation or state acquisition fails + */ @Override public void open(org.apache.flink.configuration.Configuration internalFlinkConfig) throws Exception { super.open(internalFlinkConfig); @@ -61,6 +92,17 @@ public void open(org.apache.flink.configuration.Configuration internalFlinkConfi mapState = getRuntimeContext().getMapState(deDupState); } + /** + * Decides whether a record should pass through based on whether its key was seen before. + * + *

Reads the key from the configured key column; if the key is absent from the state it is recorded + * and the record is kept, otherwise the record is treated as a duplicate and dropped. + * + * @param value the record being evaluated + * @return {@code true} if the key has not been seen before and the record should be kept, + * {@code false} if it is a duplicate + * @throws Exception if accessing the keyed state fails + */ @Override public boolean filter(Row value) throws Exception { String key = (String) value.getField(keyIndex); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/FeatureTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/FeatureTransformer.java index 40aab4aed..169cbdf84 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/FeatureTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/FeatureTransformer.java @@ -18,11 +18,29 @@ * Converts to feast Features from post processors. */ public class FeatureTransformer implements MapFunction, Transformer { + /** + * Number of fields in each generated feast feature row, namely the key, value and value type. + */ private static final int FEATURE_ROW_LENGTH = 3; + /** + * Transformation-argument key whose value names the column holding the feature key. + */ private static final String KEY_COLUMN_NAME = "keyColumnName"; + /** + * Transformation-argument key whose value names the column holding the feature value. + */ private static final String VALUE_COLUMN_NAME = "valueColumnName"; + /** + * Name of the column that supplies the feature key. + */ private final String keyColumn; + /** + * Name of the column that supplies the feature value and that receives the generated feature rows. + */ private final String valueColumn; + /** + * Ordered names of the columns in the incoming {@link Row}, used to resolve column indices. 
+ */ private String[] columnNames; /** @@ -38,6 +56,18 @@ public FeatureTransformer(Map transformationArguments, String[] this.valueColumn = transformationArguments.get(VALUE_COLUMN_NAME); } + /** + * Builds feast feature rows from the configured key and value columns of the incoming row. + * + *

Resolves the key and value column indices, delegates to {@code FeatureUtils} to populate the + * feature rows from the key and value, copies all fields of {@code inputRow} into a new {@link Row} + * and replaces the value column with the generated array of feature rows. + * + * @param inputRow the row to transform + * @return a new row whose value column holds the generated feast feature rows + * @throws IllegalArgumentException if the configured key or value column does not exist + * @throws Exception if populating the feature rows fails + */ @Override public Row map(Row inputRow) throws Exception { int featureKeyIndex = Arrays.asList(columnNames).indexOf(keyColumn); @@ -59,6 +89,15 @@ public Row map(Row inputRow) throws Exception { return outputRow; } + /** + * Wires this map function into the streaming pipeline. + * + *

Applies this transformer as a {@link MapFunction} over the input data stream and returns a new + * {@link StreamInfo} that preserves the original column names. + * + * @param inputStreamInfo the incoming stream and its column metadata + * @return a {@link StreamInfo} wrapping the mapped data stream with the original column names + */ @Override public StreamInfo transform(StreamInfo inputStreamInfo) { DataStream inputStream = inputStreamInfo.getDataStream(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/FeatureWithTypeTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/FeatureWithTypeTransformer.java index 75730c833..0cb4f31e5 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/FeatureWithTypeTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/FeatureWithTypeTransformer.java @@ -19,6 +19,9 @@ */ public class FeatureWithTypeTransformer implements MapFunction, Transformer { + /** + * Handler that holds the feature configuration and builds the typed feature rows for each input row. + */ private FeatureWithTypeHandler featureWithTypeHandler; /** @@ -32,6 +35,16 @@ public FeatureWithTypeTransformer(Map transformationArguments, S this.featureWithTypeHandler = new FeatureWithTypeHandler(transformationArguments, columnNames); } + /** + * Builds typed feast feature rows for the incoming row and stores them in the output column. + * + *

Delegates to the {@link FeatureWithTypeHandler} to populate the feature rows, copies all fields + * of {@code inputRow} into a new {@link Row} and sets the configured output column to the generated + * array of feature rows. + * + * @param inputRow the row to transform + * @return a new row whose output column holds the generated typed feast feature rows + */ @Override public Row map(Row inputRow) { ArrayList featureRows = featureWithTypeHandler.populateFeatures(inputRow); @@ -44,6 +57,15 @@ public Row map(Row inputRow) { return outputRow; } + /** + * Wires this map function into the streaming pipeline. + * + *

Applies this transformer as a {@link MapFunction} over the input data stream and returns a new + * {@link StreamInfo} that preserves the original column names. + * + * @param inputStreamInfo the incoming stream and its column metadata + * @return a {@link StreamInfo} wrapping the mapped data stream with the original column names + */ @Override public StreamInfo transform(StreamInfo inputStreamInfo) { DataStream inputStream = inputStreamInfo.getDataStream(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/HashTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/HashTransformer.java index 00e492293..e84225274 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/HashTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/HashTransformer.java @@ -25,11 +25,29 @@ * Using SHA-256 hashing to encrypt data. */ public class HashTransformer extends RichMapFunction implements Serializable, Transformer { + /** + * Configuration key holding the proto class name of the output (sink) Kafka message. + */ private static final String SINK_KAFKA_PROTO_MESSAGE = "SINK_KAFKA_PROTO_MESSAGE"; + /** + * Transformation-argument key whose value lists the field paths to be hashed. + */ private static final String ENCRYPTION_FIELD_KEY = "maskColumns"; + /** + * Dot-separated field paths whose values must be masked using SHA-256 hashing. + */ private final List fieldsToHash; + /** + * Dagger context providing access to the job configuration. + */ private final DaggerContext daggerContext; + /** + * Ordered names of the top-level columns in the incoming {@link Row}. + */ private final String[] columnNames; + /** + * Mapping from each configured field path to the {@code RowHasher} that masks it. 
+ */ private Map rowHasherMap; /** @@ -45,10 +63,25 @@ public HashTransformer(Map transformationArguments, String[] col this.daggerContext = daggerContext; } + /** + * Extracts the configured list of field paths to hash from the transformation arguments. + * + * @param transformationArguments the transformation arguments supplied to this transformer + * @return the list of dot-separated field paths to be masked + */ private ArrayList getFieldsToHash(Map transformationArguments) { return (ArrayList) transformationArguments.get(ENCRYPTION_FIELD_KEY); } + /** + * Lazily builds the field-path to hasher mapping when the operator starts. + * + *

If the hasher map has not yet been created it is built from the output proto descriptor before + * delegating to the superclass initialisation. + * + * @param internalFlinkConfig the Flink configuration supplied by the runtime + * @throws Exception if building the hasher map or the superclass initialisation fails + */ @Override public void open(org.apache.flink.configuration.Configuration internalFlinkConfig) throws Exception { if (this.rowHasherMap == null) { @@ -57,6 +90,15 @@ public void open(org.apache.flink.configuration.Configuration internalFlinkConfi super.open(internalFlinkConfig); } + /** + * Wires this hashing map function into the streaming pipeline. + * + *

Applies this {@link RichMapFunction} over the input data stream and returns a new + * {@link StreamInfo} that preserves the original column names. + * + * @param streamInfo the incoming stream and its column metadata + * @return a {@link StreamInfo} wrapping the mapped data stream with the original column names + */ @Override public StreamInfo transform(StreamInfo streamInfo) { DataStream inputStream = streamInfo.getDataStream(); @@ -81,6 +123,15 @@ protected Map createRowHasherMap() { return pathReader.fieldMaskingPath(fieldsToHash); } + /** + * Masks every configured field of the incoming row using its SHA-256 hasher. + * + *

Creates a copy of {@code inputRow} and, for each configured field path, applies the matching + * {@code RowHasher} to overwrite the field value with its hash. + * + * @param inputRow the row whose configured fields should be masked + * @return a copy of {@code inputRow} with the configured fields replaced by their hashed values + */ @Override public Row map(Row inputRow) { Row outPutRow = Row.copy(inputRow); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/InvalidRecordFilterTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/InvalidRecordFilterTransformer.java index dc8519025..158594d5f 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/InvalidRecordFilterTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/InvalidRecordFilterTransformer.java @@ -20,12 +20,30 @@ * Filter the invalid records produced by dagger. */ public class InvalidRecordFilterTransformer extends RichFilterFunction implements Transformer { + /** + * Name of the table whose invalid records this filter counts, used as a metric tag. + */ private final String tableName; + /** + * Index, within the configured column names, of the internal validation flag column. + */ private final int validationIndex; + /** + * Manager used to count the number of invalid records filtered out for this table. + */ private CounterStatsManager metricsManager = null; + /** + * Logger used to report how many invalid records have been filtered. + */ private static final Logger LOGGER = LoggerFactory.getLogger(InvalidRecordFilterTransformer.class.getName()); + /** + * Name of the internal column that carries the per-record validation flag. + */ protected static final String INTERNAL_VALIDATION_FILED = "__internal_validation_field__"; + /** + * Metric tag name used to group the filtered-record counter by table. 
+ */ private static final String PER_TABLE = "per_table"; /** @@ -40,6 +58,15 @@ public InvalidRecordFilterTransformer(Map transformationArgument validationIndex = Arrays.asList(columnNames).indexOf(INTERNAL_VALIDATION_FILED); } + /** + * Registers the invalid-record counter metric when the operator starts. + * + *

Obtains the metric group from the runtime context and registers a counter for filtered invalid + * records, tagged with the configured table name. + * + * @param internalFlinkConfig the Flink configuration supplied by the runtime + * @throws Exception if metric registration fails + */ @Override public void open(org.apache.flink.configuration.Configuration internalFlinkConfig) throws Exception { MetricGroup metricGroup = getRuntimeContext().getMetricGroup(); @@ -47,6 +74,15 @@ public void open(org.apache.flink.configuration.Configuration internalFlinkConfi metricsManager.register(FilterAspects.FILTERED_INVALID_RECORDS, PER_TABLE, tableName); } + /** + * Keeps only records that are marked valid by the internal validation flag. + * + *

When the validation flag is {@code false} the invalid-record counter is incremented, a log line + * is emitted and the record is dropped, otherwise the record is kept. + * + * @param value the record being evaluated + * @return {@code true} if the record is valid and should be kept, {@code false} if it is invalid + */ @Override public boolean filter(Row value) { if (!(boolean) value.getField(validationIndex)) { @@ -59,6 +95,15 @@ public boolean filter(Row value) { return true; } + /** + * Wires this filter into the streaming pipeline. + * + *

Applies this {@link RichFilterFunction} over the input data stream and returns a new + * {@link StreamInfo} that preserves the original column names. + * + * @param streamInfo the incoming stream and its column metadata + * @return a {@link StreamInfo} wrapping the filtered data stream with the original column names + */ @Override public StreamInfo transform(StreamInfo streamInfo) { return new StreamInfo( diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/SQLTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/SQLTransformer.java index ab879958b..8c042e8f1 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/SQLTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/SQLTransformer.java @@ -21,11 +21,29 @@ * Enables to apply a SQL transformation on top of streaming data in post processors. */ public class SQLTransformer implements Serializable, Transformer { + /** + * Ordered names of the columns in the incoming {@link Row}, used to build the table schema. + */ private final String[] columnNames; + /** + * SQL query applied to the registered table to produce the transformed stream. + */ private final String sqlQuery; + /** + * Name under which the input stream is registered as a table for the SQL query. + */ private final String tableName; + /** + * Allowed lateness, in milliseconds, used when assigning watermarks for the rowtime attribute. + */ private final long allowedLatenessInMs; + /** + * Name of the column treated as the event-time (rowtime) attribute. + */ private static final String ROWTIME = "rowtime"; + /** + * Dagger context providing access to the Flink table environment. 
+ */ private final DaggerContext daggerContext; /** @@ -43,6 +61,17 @@ public SQLTransformer(Map transformationArguments, String[] colu this.daggerContext = daggerContext; } + /** + * Applies the configured SQL query to the input stream and returns the resulting stream. + * + *

Builds the table schema from the column names, registers the input stream as a table (assigning + * a rowtime time attribute and watermarks when a rowtime column is present), runs the SQL query and + * converts the resulting retract stream back into an append-only stream of {@link Row} records. + * + * @param inputStreamInfo the incoming stream and its column metadata + * @return a {@link StreamInfo} wrapping the query-result stream together with the query output column names + * @throws IllegalArgumentException if no SQL query was provided in the transformation arguments + */ @Override public StreamInfo transform(StreamInfo inputStreamInfo) { DataStream inputStream = inputStreamInfo.getDataStream(); @@ -65,6 +94,15 @@ public StreamInfo transform(StreamInfo inputStreamInfo) { return new StreamInfo(outputStream, table.getSchema().getFieldNames()); } + /** + * Assigns timestamps and watermarks to the stream based on the rowtime field. + * + *

Uses a {@link StreamWatermarkAssigner} backed by a {@link RowtimeFieldWatermark} over the column + * names, applying the configured allowed lateness. + * + * @param inputStream the stream to which timestamps and watermarks are assigned + * @return the input stream with timestamps and watermarks assigned + */ private DataStream assignTimeAttribute(DataStream inputStream) { StreamWatermarkAssigner streamWatermarkAssigner = new StreamWatermarkAssigner(new RowtimeFieldWatermark(columnNames)); return streamWatermarkAssigner.assignTimeStampAndWatermark(inputStream, allowedLatenessInMs); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/feature/FeatureWithTypeHandler.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/feature/FeatureWithTypeHandler.java index 05c7c07ad..1dd82ef6d 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/feature/FeatureWithTypeHandler.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/feature/FeatureWithTypeHandler.java @@ -15,16 +15,43 @@ * The Feature with type handler. */ public class FeatureWithTypeHandler implements Serializable { + /** + * Transformation-argument key whose value names the output column that receives the feature rows. + */ private static final String OUTPUT_COLUMN_NAME_KEY = "outputColumnName"; + /** + * Transformation-argument key whose value lists the per-feature key, value and type definitions. + */ private static final String OUTPUT_COLUMN_DATA_KEY = "data"; + /** + * Key, within a feature definition, naming the column that supplies the feature key. + */ private static final String KEY_COLUMN_NAME = "keyColumnName"; + /** + * Key, within a feature definition, naming the column that supplies the feature value. + */ private static final String VALUE_COLUMN_NAME = "valueColumnName"; + /** + * Key, within a feature definition, naming the value type of the feature. 
+ */ private static final String TYPE = "type"; + /** + * Number of fields in each generated feature row, namely the key, value and value type. + */ private static final int FEATURE_ROW_LENGTH = 3; + /** + * Ordered names of the columns in the incoming {@link Row}, used to resolve column indices. + */ private String[] inputColumns; + /** + * Per-feature definitions, each holding the key column, value column and value type. + */ private List> featureInfoList; + /** + * Name of the column that receives the generated array of feature rows. + */ private String outputColumnName; /** @@ -39,6 +66,14 @@ public FeatureWithTypeHandler(Map transformationArguments, Strin createFeatureInfoList(transformationArguments); } + /** + * Builds the list of feature definitions from the transformation arguments. + * + *

Reads the output-column data entries and, for each, records the key column, value column and + * value type as a {@link Tuple3} in the feature-info list. + * + * @param transformationArguments the transformation arguments supplied to this handler + */ private void createFeatureInfoList(Map transformationArguments) { featureInfoList = new ArrayList<>(); List> outputColumnData = (List>) transformationArguments.get(OUTPUT_COLUMN_DATA_KEY); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/filter/FilterAspects.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/filter/FilterAspects.java index 280464632..afa259eb9 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/filter/FilterAspects.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/filter/FilterAspects.java @@ -12,19 +12,41 @@ public enum FilterAspects implements Aspects { */ FILTERED_INVALID_RECORDS("filtered_invalid_records", AspectType.Counter); + /** + * Creates a filter aspect with its metric name and aspect type. + * + * @param value the metric name reported for this aspect + * @param type the {@link AspectType} describing how this aspect is measured + */ FilterAspects(String value, AspectType type) { this.value = value; this.type = type; } + /** + * Metric name reported for this aspect. + */ private String value; + /** + * Aspect type describing how this aspect is measured. + */ private AspectType type; + /** + * Returns the metric name reported for this filter aspect. + * + * @return the metric name of this filter aspect + */ @Override public String getValue() { return this.value; } + /** + * Returns the aspect type describing how this filter aspect is measured. 
+ * + * @return the {@link AspectType} of this filter aspect + */ @Override public AspectType getAspectType() { return this.type; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/PathReader.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/PathReader.java index 5eb8ce0a8..2781264b5 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/PathReader.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/PathReader.java @@ -13,7 +13,13 @@ */ public class PathReader { + /** + * Root proto descriptor against which the configured field paths are resolved. + */ private static Descriptors.Descriptor parentDescriptor; + /** + * Ordered names of the top-level input columns, used to resolve the root index of each field path. + */ private List inputColumns; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/IntegerFieldHasher.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/IntegerFieldHasher.java index 5b8d1555c..da37448fe 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/IntegerFieldHasher.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/IntegerFieldHasher.java @@ -8,6 +8,9 @@ */ public class IntegerFieldHasher implements FieldHasher { + /** + * Dot-separated path identifying the field this hasher masks; for a primitive it holds a single segment. + */ private final String[] fieldPath; /** @@ -19,6 +22,13 @@ public IntegerFieldHasher(String[] fieldPath) { this.fieldPath = fieldPath; } + /** + * Hashes the given integer value using SHA-256 and returns the hashed integer. 
+ * + * @param elem the integer field value to hash + * @return the hashed integer value + * @throws RowHashException if the value cannot be hashed as an integer + */ @Override public Object maskRow(Object elem) { try { @@ -31,6 +41,15 @@ public Object maskRow(Object elem) { } } + /** + * Determines whether this hasher can mask the given field. + * + *

Returns {@code true} only for a single-segment path that points to a valid, non-repeated field + * of protobuf integer type. + * + * @param fieldDescriptor the descriptor of the field to be masked + * @return {@code true} if this hasher can process the field, {@code false} otherwise + */ @Override public boolean canProcess(Descriptors.FieldDescriptor fieldDescriptor) { return fieldPath.length == 1 @@ -38,6 +57,12 @@ && isValidNonRepeatedField(fieldDescriptor) && fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.INT; } + /** + * Returns this hasher unchanged, since an integer field has no child to configure. + * + * @param fieldDescriptor the descriptor of the field being masked + * @return this hasher + */ @Override public FieldHasher setChild(Descriptors.FieldDescriptor fieldDescriptor) { return this; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/LongFieldHasher.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/LongFieldHasher.java index b1a137b3b..ce6dd60a0 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/LongFieldHasher.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/LongFieldHasher.java @@ -8,6 +8,9 @@ */ public class LongFieldHasher implements FieldHasher { + /** + * Dot-separated path identifying the field this hasher masks; for a primitive it holds a single segment. + */ private final String[] fieldPath; /** @@ -19,6 +22,13 @@ public LongFieldHasher(String[] fieldPath) { this.fieldPath = fieldPath; } + /** + * Hashes the given long value using SHA-256 and returns the hashed long. 
+ * + * @param elem the long field value to hash + * @return the hashed long value + * @throws RowHashException if the value cannot be hashed as a long + */ @Override public Object maskRow(Object elem) { try { @@ -31,6 +41,15 @@ public Object maskRow(Object elem) { } } + /** + * Determines whether this hasher can mask the given field. + * + *

Returns {@code true} only for a single-segment path that points to a valid, non-repeated field + * of protobuf long type. + * + * @param fieldDescriptor the descriptor of the field to be masked + * @return {@code true} if this hasher can process the field, {@code false} otherwise + */ @Override public boolean canProcess(Descriptors.FieldDescriptor fieldDescriptor) { return fieldPath.length == 1 @@ -38,6 +57,12 @@ && isValidNonRepeatedField(fieldDescriptor) && fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.LONG; } + /** + * Returns this hasher unchanged, since a long field has no child to configure. + * + * @param fieldDescriptor the descriptor of the field being masked + * @return this hasher + */ @Override public FieldHasher setChild(Descriptors.FieldDescriptor fieldDescriptor) { return this; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/RowHasher.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/RowHasher.java index 7e8db616c..8eaec805c 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/RowHasher.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/RowHasher.java @@ -11,8 +11,17 @@ */ public class RowHasher implements FieldHasher { + /** + * Dot-separated path segments from this row down to the nested field that must be masked. + */ private String[] splittedFieldPath; + /** + * Hasher responsible for masking the next segment of the field path within this row. + */ private FieldHasher child; + /** + * Index, within this row, of the field handled by the child hasher. + */ private int childIndex; /** @@ -35,6 +44,14 @@ public RowHasher(int childIndex, FieldHasher child) { this.childIndex = childIndex; } + /** + * Masks the nested field within the given row by delegating to the child hasher. + * + *

Replaces the field at the child index of the row with the value produced by the child hasher. + * + * @param elem the {@link Row} containing the field to mask + * @return the same row with its nested field masked + */ @Override public Object maskRow(Object elem) { Row currentRow = (Row) elem; @@ -42,6 +59,15 @@ public Object maskRow(Object elem) { return currentRow; } + /** + * Determines whether this hasher can descend into the given field. + * + *

Returns {@code true} only for a multi-segment path that points to a valid, non-repeated field + * of protobuf message type. + * + * @param fieldDescriptor the descriptor of the field to be masked + * @return {@code true} if this hasher can process the field, {@code false} otherwise + */ @Override public boolean canProcess(Descriptors.FieldDescriptor fieldDescriptor) { return splittedFieldPath.length > 1 @@ -49,6 +75,12 @@ && isValidNonRepeatedField(fieldDescriptor) && fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.MESSAGE; } + /** + * Lazily creates and stores the child hasher for the next path segment when not already set. + * + * @param fieldDescriptor the descriptor of the message field this row hasher handles + * @return this hasher + */ @Override public FieldHasher setChild(Descriptors.FieldDescriptor fieldDescriptor) { if (child == null) { @@ -57,6 +89,15 @@ public FieldHasher setChild(Descriptors.FieldDescriptor fieldDescriptor) { return this; } + /** + * Builds the child hasher for the remaining field path beneath this row. + * + *

Strips the first path segment, resolves the descriptor of the next field within the message + * type, records its index and creates the appropriate child hasher via {@code FieldHasherFactory}. + * + * @param fieldDescriptor the descriptor of the message field this row hasher handles + * @return the child hasher for the next segment of the field path + */ private FieldHasher createChild(Descriptors.FieldDescriptor fieldDescriptor) { String[] childColumnPath = Arrays.copyOfRange(splittedFieldPath, 1, splittedFieldPath.length); String childField = childColumnPath[0]; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/StringFieldHasher.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/StringFieldHasher.java index f038cd7da..72317c10a 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/StringFieldHasher.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/StringFieldHasher.java @@ -10,6 +10,9 @@ */ public class StringFieldHasher implements FieldHasher { + /** + * Dot-separated path identifying the field this hasher masks; for a primitive it holds a single segment. + */ private final String[] fieldPath; /** @@ -21,6 +24,13 @@ public StringFieldHasher(String[] fieldPath) { this.fieldPath = fieldPath; } + /** + * Hashes the given string value using SHA-256 and returns the hexadecimal hash. + * + * @param elem the string field value to hash + * @return the hashed string value + * @throws RowHashException if the value cannot be hashed as a string + */ @Override public Object maskRow(Object elem) { try { @@ -33,6 +43,15 @@ public Object maskRow(Object elem) { } } + /** + * Determines whether this hasher can mask the given field. + * + *

Returns {@code true} only for a single-segment path that points to a valid, non-repeated field + * of protobuf string type. + * + * @param fieldDescriptor the descriptor of the field to be masked + * @return {@code true} if this hasher can process the field, {@code false} otherwise + */ @Override public boolean canProcess(Descriptors.FieldDescriptor fieldDescriptor) { return fieldPath.length == 1 @@ -40,6 +59,12 @@ && isValidNonRepeatedField(fieldDescriptor) && fieldDescriptor.getJavaType() == Descriptors.FieldDescriptor.JavaType.STRING; } + /** + * Returns this hasher unchanged, since a string field has no child to configure. + * + * @param fieldDescriptor the descriptor of the field being masked + * @return this hasher + */ @Override public FieldHasher setChild(Descriptors.FieldDescriptor fieldDescriptor) { return this; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/UnsupportedDataTypeHasher.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/UnsupportedDataTypeHasher.java index 2e65385ea..95b9d1845 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/UnsupportedDataTypeHasher.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/transformers/hash/field/UnsupportedDataTypeHasher.java @@ -7,6 +7,9 @@ * The Unsupported data type hasher. */ public class UnsupportedDataTypeHasher implements FieldHasher { + /** + * Dot-separated path identifying the field that could not be matched to a supported hasher. + */ private String[] fieldPath; /** @@ -18,16 +21,35 @@ public UnsupportedDataTypeHasher(String[] fieldPath) { this.fieldPath = fieldPath; } + /** + * Returns the given value unchanged, since this fallback hasher performs no masking. 
+ * + * @param elem the field value + * @return the value unchanged + */ @Override public Object maskRow(Object elem) { return elem; } + /** + * Always reports that this fallback hasher cannot process the field. + * + * @param fieldDescriptor the descriptor of the field to be masked + * @return {@code false} always + */ @Override public boolean canProcess(Descriptors.FieldDescriptor fieldDescriptor) { return false; } + /** + * Always fails because the field has no supported primitive type to hash. + * + * @param fieldDescriptor the descriptor of the field being masked + * @return never returns normally + * @throws InvalidHashFieldException always, indicating the field cannot be hashed + */ @Override public FieldHasher setChild(Descriptors.FieldDescriptor fieldDescriptor) { if (fieldPath.length == 0 || fieldDescriptor == null) { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/CollectArray.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/CollectArray.java index 8fe7a6c09..4b876f658 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/CollectArray.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/CollectArray.java @@ -15,6 +15,14 @@ @FunctionHint(output = @DataTypeHint(value = "RAW", bridgedTo = ArrayList.class)) public class CollectArray extends AggregateUdf, ArrayAccumulator> { + /** + * Creates a fresh, empty {@link ArrayAccumulator} for a new aggregation group. + * + *

Flink invokes this method once per aggregation key to obtain the mutable state + * into which input objects are folded by {@code accumulate}. + * + * @return a new, empty {@link ArrayAccumulator} instance + */ public ArrayAccumulator createAccumulator() { return new ArrayAccumulator(); } @@ -32,10 +40,29 @@ public void accumulate(ArrayAccumulator arrayAccumulator, @DataTypeHint(inputGro arrayAccumulator.add(obj); } + /** + * Returns the aggregated result by emitting every object collected so far. + * + *

Flink calls this to compute the final output value of the aggregation from the + * supplied accumulator state. + * + * @param arrayAccumulator the accumulator holding the collected objects + * @return the list of all objects accumulated for the current group, in insertion order + */ public List getValue(ArrayAccumulator arrayAccumulator) { return arrayAccumulator.emit(); } + /** + * Merges the objects collected by other accumulators into the target accumulator. + * + *

Flink uses this when partial aggregates computed in parallel (for example across + * session windows or split groups) must be combined; every object from each accumulator + * in {@code it} is appended to {@code arrayAccumulator}. + * + * @param arrayAccumulator the accumulator that receives the merged objects + * @param it the other accumulators whose collected objects are merged in + */ public void merge(ArrayAccumulator arrayAccumulator, Iterable it) { for (ArrayAccumulator accumulatorInstance : it) { arrayAccumulator.getArrayList().addAll(accumulatorInstance.getArrayList()); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/DistinctCount.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/DistinctCount.java index a01e96eb3..90ff51e69 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/DistinctCount.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/DistinctCount.java @@ -8,11 +8,27 @@ */ public class DistinctCount extends AggregateUdf { + /** + * Creates a fresh, empty {@link DistinctCountAccumulator} for a new aggregation group. + * + *

Flink invokes this once per aggregation key to obtain the mutable state used to + * track the set of distinct values seen for that group. + * + * @return a new, empty {@link DistinctCountAccumulator} instance + */ @Override public DistinctCountAccumulator createAccumulator() { return new DistinctCountAccumulator(); } + /** + * Returns the number of distinct values accumulated so far. + * + *

Flink calls this to compute the final aggregation output from the accumulator state. + * + * @param distinctCountAccumulator the accumulator holding the distinct values + * @return the count of distinct items recorded for the current group + */ @Override public Integer getValue(DistinctCountAccumulator distinctCountAccumulator) { return distinctCountAccumulator.count(); @@ -32,6 +48,16 @@ public void accumulate(DistinctCountAccumulator distinctCountAccumulator, String distinctCountAccumulator.add(item); } + /** + * Merges the distinct values from other accumulators into the target accumulator. + * + *

Each {@link DistinctCountAccumulator} in {@code it} contributes its recorded items, + * which are added to {@code distinctCountAccumulator}; duplicates are naturally removed + * because the underlying storage is a set. + * + * @param distinctCountAccumulator the accumulator that receives the merged distinct values + * @param it the other accumulators whose distinct values are merged in + */ public void merge(DistinctCountAccumulator distinctCountAccumulator, Iterable it) { for (DistinctCountAccumulator distinctCountAccumulatorInstance : it) { distinctCountAccumulator.getDistinctItems().addAll(distinctCountAccumulatorInstance.getDistinctItems()); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/Features.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/Features.java index 3fadb57f4..17c1a546c 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/Features.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/Features.java @@ -14,11 +14,28 @@ @FunctionHint(output = @DataTypeHint("RAW")) public class Features extends AggregateUdf { + /** + * Creates a fresh, empty {@link FeatureAccumulator} for a new aggregation group. + * + *

Flink invokes this once per aggregation key to obtain the mutable state used to + * collect key/value feature pairs. + * + * @return a new, empty {@link FeatureAccumulator} instance + */ @Override public FeatureAccumulator createAccumulator() { return new FeatureAccumulator(); } + /** + * Returns the accumulated features as an array of Feast feature {@link Row} values. + * + *

Flink calls this to produce the final aggregation output, converting every collected + * key/value pair into its {@link Row} representation. + * + * @param featureAccumulator the accumulator holding the collected feature pairs + * @return an array of {@link Row} values, one per accumulated feature + */ @Override public Row[] getValue(FeatureAccumulator featureAccumulator) { return featureAccumulator.getFeaturesAsRows(); @@ -43,6 +60,15 @@ public void accumulate(FeatureAccumulator featureAccumulator, @DataTypeHint(inpu } } + /** + * Merges the features collected by other accumulators into the target accumulator. + * + *

Every feature pair from each {@link FeatureAccumulator} in {@code it} is appended to + * {@code featureAccumulator}, combining partial aggregates produced in parallel. + * + * @param featureAccumulator the accumulator that receives the merged features + * @param it the other accumulators whose features are merged in + */ public void merge(FeatureAccumulator featureAccumulator, Iterable it) { for (FeatureAccumulator accumulatorInstance : it) { featureAccumulator.getFeatures().addAll(accumulatorInstance.getFeatures()); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/FeaturesWithType.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/FeaturesWithType.java index 8894dcf8e..adfc597c0 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/FeaturesWithType.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/FeaturesWithType.java @@ -16,11 +16,28 @@ @FunctionHint(output = @DataTypeHint("RAW")) public class FeaturesWithType extends AggregateUdf { + /** + * Creates a fresh, empty {@link FeatureWithTypeAccumulator} for a new aggregation group. + * + *

Flink invokes this once per aggregation key to obtain the mutable state used to + * collect typed key/value feature triplets. + * + * @return a new, empty {@link FeatureWithTypeAccumulator} instance + */ @Override public FeatureWithTypeAccumulator createAccumulator() { return new FeatureWithTypeAccumulator(); } + /** + * Returns the accumulated typed features as an array of Feast feature {@link Row} values. + * + *

Flink calls this to produce the final aggregation output, converting every collected + * typed feature triplet into its {@link Row} representation. + * + * @param featureAccumulator the accumulator holding the collected typed features + * @return an array of {@link Row} values, one per accumulated feature + */ @Override public Row[] getValue(FeatureWithTypeAccumulator featureAccumulator) { return featureAccumulator.getFeaturesAsRows(); @@ -56,12 +73,33 @@ public void retract(FeatureWithTypeAccumulator featureAccumulator, Object... obj } } + /** + * Merges the typed features collected by other accumulators into the target accumulator. + * + *

Every feature triplet from each {@link FeatureWithTypeAccumulator} in {@code it} is + * re-added to {@code featureWithTypeAccumulator}, combining partial aggregates produced in + * parallel while preserving the de-duplication keyed on feature name and value. + * + * @param featureWithTypeAccumulator the accumulator that receives the merged features + * @param it the other accumulators whose features are merged in + */ public void merge(FeatureWithTypeAccumulator featureWithTypeAccumulator, Iterable it) { for (FeatureWithTypeAccumulator accumulatorInstance : it) { accumulatorInstance.getFeatures().forEach((s, tuple3) -> featureWithTypeAccumulator.add(tuple3.f0, tuple3.f1, tuple3.f2)); } } + /** + * Validates that the supplied arguments form complete feature triplets. + * + *

Each feature requires a fixed number of arguments + * ({@link Constants#NUMBER_OF_ARGUMENTS_IN_FEATURE_ACCUMULATOR}: name, value and type), so the + * total number of arguments must be an exact multiple of that group size. + * + * @param objects the raw arguments passed to {@code accumulate} or {@code retract} + * @throws InvalidNumberOfArgumentsException if the number of arguments is not a multiple of the + * required triplet size + */ private void validate(Object[] objects) { if (objects.length % Constants.NUMBER_OF_ARGUMENTS_IN_FEATURE_ACCUMULATOR != 0) { throw new InvalidNumberOfArgumentsException(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/PercentileAggregator.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/PercentileAggregator.java index 2bf1058e6..1c99d872c 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/PercentileAggregator.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/PercentileAggregator.java @@ -16,11 +16,27 @@ */ public class PercentileAggregator extends AggregateUdf { + /** + * Creates a fresh, empty {@link PercentileAccumulator} for a new aggregation group. + * + *

Flink invokes this once per aggregation key to obtain the mutable state used to + * collect the sample values and the requested percentile. + * + * @return a new, empty {@link PercentileAccumulator} instance + */ @Override public PercentileAccumulator createAccumulator() { return new PercentileAccumulator(); } + /** + * Returns the computed percentile over all values accumulated so far. + * + *

Flink calls this to produce the final aggregation output from the accumulator state. + * + * @param acc the accumulator holding the sample values and the requested percentile + * @return the percentile value computed from the accumulated samples + */ @Override public Double getValue(PercentileAccumulator acc) { return acc.getPercentileValue(); @@ -37,6 +53,16 @@ public void accumulate(PercentileAccumulator acc, @DataTypeHint("DECIMAL(30, 3)" acc.add(percentile.doubleValue(), dValue.doubleValue()); } + /** + * Merges the sample values from other accumulators into the target accumulator. + * + *

The double values held by each {@link PercentileAccumulator} in {@code otherAccumulators} + * are appended to {@code percentileAccumulator}, and the requested percentile is carried over so + * the combined accumulator can compute the percentile across all partial samples. + * + * @param percentileAccumulator the accumulator that receives the merged sample values + * @param otherAccumulators the other accumulators whose sample values are merged in + */ public void merge(PercentileAccumulator percentileAccumulator, Iterable otherAccumulators) { for (PercentileAccumulator accumulatorInstance : otherAccumulators) { percentileAccumulator.getdValueList().addAll(accumulatorInstance.getdValueList()); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/ArrayAccumulator.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/ArrayAccumulator.java index 8c64b717d..3de7927c7 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/ArrayAccumulator.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/ArrayAccumulator.java @@ -11,6 +11,9 @@ */ public class ArrayAccumulator implements Serializable { + /** + * Backing list that stores every object collected by the owning aggregation. + */ private @DataTypeHint("RAW") List arrayList = new ArrayList<>(); /** @@ -31,10 +34,22 @@ public List emit() { return arrayList; } + /** + * Returns the backing list of accumulated objects. + * + *

Primarily intended for state access during {@code merge} and for serialization. + * + * @return the mutable list of collected objects + */ public List getArrayList() { return arrayList; } + /** + * Replaces the backing list of accumulated objects. + * + * @param arrayList the list of objects to use as the accumulator state + */ public void setArrayList(List arrayList) { this.arrayList = arrayList; } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/DistinctCountAccumulator.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/DistinctCountAccumulator.java index ac5a0e72d..e42b0ecdc 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/DistinctCountAccumulator.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/DistinctCountAccumulator.java @@ -10,6 +10,9 @@ */ public class DistinctCountAccumulator implements Serializable { + /** + * Backing set holding the distinct values observed by the owning aggregation. + */ private @DataTypeHint("RAW") HashSet distinctItems = new HashSet<>(); /** @@ -30,10 +33,22 @@ public void add(String item) { distinctItems.add(item); } + /** + * Returns the backing set of distinct values. + * + *

Primarily intended for state access during {@code merge} and for serialization. + * + * @return the mutable set of distinct items + */ public HashSet getDistinctItems() { return distinctItems; } + /** + * Replaces the backing set of distinct values. + * + * @param distinctItems the set of distinct items to use as the accumulator state + */ public void setDistinctItems(HashSet distinctItems) { this.distinctItems = distinctItems; } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/FeatureAccumulator.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/FeatureAccumulator.java index 5ff15cf85..a0013f16f 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/FeatureAccumulator.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/FeatureAccumulator.java @@ -14,7 +14,13 @@ */ public class FeatureAccumulator implements Serializable { + /** + * Backing list of key/value feature pairs collected by the owning aggregation. + */ private @DataTypeHint("RAW") List> features = new ArrayList<>(); + /** + * Fixed number of fields used to build each emitted Feast feature {@link Row}. + */ private static final Integer FEATURE_ROW_LENGTH = 3; /** @@ -42,10 +48,22 @@ public Row[] getFeaturesAsRows() { return featureRows.toArray(new Row[0]); } + /** + * Returns the backing list of collected feature pairs. + * + *

Primarily intended for state access during {@code merge} and for serialization. + * + * @return the mutable list of key/value feature pairs + */ public List> getFeatures() { return features; } + /** + * Replaces the backing list of collected feature pairs. + * + * @param features the list of key/value feature pairs to use as the accumulator state + */ public void setFeatures(List> features) { this.features = features; } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/FeatureWithTypeAccumulator.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/FeatureWithTypeAccumulator.java index 59fa55d1f..d21942b18 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/FeatureWithTypeAccumulator.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/FeatureWithTypeAccumulator.java @@ -14,8 +14,14 @@ * The accumulator for FeatureWithType udf. */ public class FeatureWithTypeAccumulator implements Serializable { + /** + * Fixed number of fields used to build each emitted Feast feature {@link Row}. + */ private static final Integer FEATURE_ROW_LENGTH = 3; + /** + * Backing map of de-duplicated typed feature triplets, keyed by feature name and hash. + */ private @DataTypeHint("RAW") HashMap> features = new HashMap<>(); /** @@ -58,14 +64,36 @@ public void remove(String key, Object value, ValueEnum type) { features.remove(getMapKey(key, featureTuple.hashCode())); } + /** + * Returns the backing map of collected typed features. + * + *

<p>Primarily intended for state access during {@code merge} and for serialization. + * + * @return the mutable map of de-duplicated typed feature triplets + */ public HashMap> getFeatures() { return features; } + /** + * Replaces the backing map of collected typed features. + * + * @param features the map of typed feature triplets to use as the accumulator state + */ public void setFeatures(HashMap> features) { this.features = features; } + /** + * Builds the map key used to de-duplicate a feature within the accumulator. + * + *

The key combines the feature name with the hash code of its full triplet so that + * features differing in value or type are stored under distinct keys. + * + * @param key the feature name + * @param hashcode the hash code of the feature triplet + * @return the composite map key in the form {@code name-hashcode} + */ private String getMapKey(String key, Integer hashcode) { return String.format("%s-%d", key, hashcode); } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/PercentileAccumulator.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/PercentileAccumulator.java index f82eb2272..64555a20f 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/PercentileAccumulator.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/accumulator/PercentileAccumulator.java @@ -11,8 +11,14 @@ */ public class PercentileAccumulator implements Serializable { + /** + * Backing list of sample values over which the percentile is computed. + */ private List dValueList = new ArrayList<>(); + /** + * The requested percentile (for example {@code 95.0}) to evaluate over the samples. + */ private double percentile; /** @@ -36,18 +42,40 @@ public double getPercentileValue() { evaluate(dValueList.stream().sorted().mapToDouble(Double::doubleValue).toArray(), 0, dValueList.size()); } + /** + * Returns the backing list of sample values. + * + *

Primarily intended for state access during {@code merge} and for serialization. + * + * @return the mutable list of sample values + */ public List getdValueList() { return dValueList; } + /** + * Replaces the backing list of sample values. + * + * @param dValueList the list of sample values to use as the accumulator state + */ public void setdValueList(List dValueList) { this.dValueList = dValueList; } + /** + * Returns the requested percentile to be evaluated over the samples. + * + * @return the percentile value + */ public double getPercentile() { return percentile; } + /** + * Sets the requested percentile to be evaluated over the samples. + * + * @param percentile the percentile value to evaluate + */ public void setPercentile(double percentile) { this.percentile = percentile; } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/BigDecimalValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/BigDecimalValueTransformer.java index 770e36750..445fb14bb 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/BigDecimalValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/BigDecimalValueTransformer.java @@ -8,21 +8,46 @@ * The Big decimal value transformer. */ public class BigDecimalValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. + * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is a {@link BigDecimal}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof BigDecimal; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. 
+ * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code value} is a {@link BigDecimal} and {@code targetType} is + * {@link ValueEnum#DoubleType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return value instanceof BigDecimal && targetType == DoubleType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. + * + * @return the index of {@link ValueEnum#DoubleType}, as a {@link BigDecimal} maps to a Feast double value + */ @Override public Integer getIndex() { return DoubleType.getValue(); } + /** + * Converts the given {@link BigDecimal} into the {@code double} held in the Feast double value slot. + * + * @param value the {@link BigDecimal} to convert; may be {@code null} + * @return the {@code double} value of {@code value}, or {@code 0.0} when {@code value} is {@code null} + */ @Override public Object getValue(Object value) { if (value == null) { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/BooleanValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/BooleanValueTransformer.java index 543ce35ad..a9c4b6d4d 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/BooleanValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/BooleanValueTransformer.java @@ -6,21 +6,45 @@ * The Boolean value transformer. */ public class BooleanValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. 
+ * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is a {@link Boolean}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof Boolean; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. + * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code targetType} is {@link ValueEnum#BooleanType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return targetType == BooleanType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. + * + * @return the index of {@link ValueEnum#BooleanType}, as a {@link Boolean} maps to a Feast boolean value + */ @Override public Integer getIndex() { return BooleanType.getValue(); } + /** + * Converts the given value into the {@link Boolean} held in the Feast boolean value slot. + * + * @param value the value to convert; may be {@code null} + * @return {@code value} when it is non-{@code null}, otherwise {@code false} + */ @Override public Object getValue(Object value) { return value != null ? value : false; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/ByteValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/ByteValueTransformer.java index d660d32ba..5af59f53e 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/ByteValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/ByteValueTransformer.java @@ -8,16 +8,34 @@ * The Byte value transformer. 
*/ public class ByteValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. + * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is a {@link ByteString}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof ByteString; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. + * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code targetType} is {@link ValueEnum#ByteType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return targetType == ByteType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. + * + * @return the index of {@link ValueEnum#ByteType}, as a {@link ByteString} maps to a Feast bytes value + */ @Override public Integer getIndex() { return ByteType.getValue(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/DoubleValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/DoubleValueTransformer.java index 49d998b7e..ea79b776a 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/DoubleValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/DoubleValueTransformer.java @@ -7,21 +7,45 @@ */ public class DoubleValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. 
+ * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is a {@link Double}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof Double; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. + * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code targetType} is {@link ValueEnum#DoubleType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return targetType == DoubleType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. + * + * @return the index of {@link ValueEnum#DoubleType}, as a {@link Double} maps to a Feast double value + */ @Override public Integer getIndex() { return DoubleType.getValue(); } + /** + * Converts the given value into the {@code double} held in the Feast double value slot. + * + * @param value the value to convert; may be {@code null} + * @return the parsed {@code double} of {@code value}, or {@code 0.0} when {@code value} is {@code null} + */ @Override public Object getValue(Object value) { return value != null ? Double.valueOf(value.toString()) : 0.0d; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/FloatValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/FloatValueTransformer.java index 76da11c4a..3ad1f36ba 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/FloatValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/FloatValueTransformer.java @@ -6,21 +6,45 @@ * The Float value transformer. 
*/ public class FloatValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. + * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is a {@link Float}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof Float; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. + * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code targetType} is {@link ValueEnum#FloatType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return targetType == FloatType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. + * + * @return the index of {@link ValueEnum#FloatType}, as a {@link Float} maps to a Feast float value + */ @Override public Integer getIndex() { return FloatType.getValue(); } + /** + * Converts the given value into the {@code float} held in the Feast float value slot. + * + * @param value the value to convert; may be {@code null} + * @return the parsed {@code float} of {@code value}, or {@code 0.0} when {@code value} is {@code null} + */ @Override public Object getValue(Object value) { return value != null ? 
Float.valueOf(value.toString()) : 0.0f; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/IntegerValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/IntegerValueTransformer.java index 9af0022f5..c80166540 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/IntegerValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/IntegerValueTransformer.java @@ -8,21 +8,45 @@ public class IntegerValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. + * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is an {@link Integer}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof Integer; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. + * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code targetType} is {@link ValueEnum#IntegerType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return targetType == IntegerType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. + * + * @return the index of {@link ValueEnum#IntegerType}, as an {@link Integer} maps to a Feast integer value + */ @Override public Integer getIndex() { return IntegerType.getValue(); } + /** + * Converts the given value into the {@code int} held in the Feast integer value slot. 
+ * + * @param value the value to convert; may be {@code null} + * @return the parsed {@code int} of {@code value}, or {@code 0} when {@code value} is {@code null} + */ @Override public Object getValue(Object value) { return value != null ? Integer.valueOf(value.toString()) : 0; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/LongValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/LongValueTransformer.java index 78fafa53b..814ecefe3 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/LongValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/LongValueTransformer.java @@ -7,21 +7,45 @@ */ public class LongValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. + * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is a {@link Long}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof Long; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. + * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code targetType} is {@link ValueEnum#LongType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return targetType == LongType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. 
+ * + * @return the index of {@link ValueEnum#LongType}, as a {@link Long} maps to a Feast long value + */ @Override public Integer getIndex() { return LongType.getValue(); } + /** + * Converts the given value into the {@code long} held in the Feast long value slot. + * + * @param value the value to convert; may be {@code null} + * @return the parsed {@code long} of {@code value}, or {@code 0L} when {@code value} is {@code null} + */ @Override public Object getValue(Object value) { return value != null ? Long.valueOf(value.toString()) : 0L; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/NullValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/NullValueTransformer.java index 80d07efc2..2d87195ef 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/NullValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/NullValueTransformer.java @@ -9,21 +9,46 @@ */ public class NullValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. + * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is {@code null}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return null == value; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. 
+ * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type, ignored for {@code null} values + * @return {@code true} when {@code value} is {@code null}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return null == value; } + /** + * Reports that no Feast value slot index applies to a {@code null} value. + * + * @return never returns normally + * @throws NotImplementedException always, since a {@code null} value has no Feast value slot index + */ @Override public Integer getIndex() { throw new NotImplementedException("Index for Null Value shouldn't be used"); } + /** + * Builds an empty Feast value row for a {@code null} value, leaving every value slot unset. + * + * @param value the value to transform; expected to be {@code null} + * @return a new empty {@link Row} sized to {@link Constants#NUMBER_OF_DATA_TYPES_IN_FEATURE_ROW} + */ @Override public Row transform(Object value) { return new Row(Constants.NUMBER_OF_DATA_TYPES_IN_FEATURE_ROW); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/StringValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/StringValueTransformer.java index 987f61c0a..a3af6a897 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/StringValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/StringValueTransformer.java @@ -7,21 +7,45 @@ */ public class StringValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. 
+ * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is a {@link String}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof String; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. + * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code targetType} is {@link ValueEnum#StringType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return targetType == StringType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. + * + * @return the index of {@link ValueEnum#StringType}, as a {@link String} maps to a Feast string value + */ @Override public Integer getIndex() { return StringType.getValue(); } + /** + * Converts the given value into the {@link String} held in the Feast string value slot. 
+ * + * @param value the value to convert; expected to be non-{@code null} + * @return the {@link String} form of {@code value} produced by its {@code toString()} method + */ @Override public Object getValue(Object value) { return value.toString(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/TimestampValueTransformer.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/TimestampValueTransformer.java index c7324d3c5..ec3aaa505 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/TimestampValueTransformer.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/TimestampValueTransformer.java @@ -9,16 +9,35 @@ */ public class TimestampValueTransformer implements ValueTransformer { + /** + * Determines whether this transformer can convert the given value into a Feast value. + * + * @param value the candidate value to inspect + * @return {@code true} when {@code value} is a {@link Row} with an arity of {@code 2}, otherwise {@code false} + */ @Override public boolean canTransform(Object value) { return value instanceof Row && ((Row) value).getArity() == 2; } + /** + * Determines whether this transformer can convert the given value into the requested Feast target type. + * + * @param value the candidate value to inspect + * @param targetType the desired Feast value type + * @return {@code true} when {@code value} is a {@link Row} with an arity of {@code 2} and {@code targetType} + * is {@link ValueEnum#TimestampType}, otherwise {@code false} + */ @Override public boolean canTransformWithTargetType(Object value, ValueEnum targetType) { return value instanceof Row && ((Row) value).getArity() == 2 && targetType == ValueEnum.TimestampType; } + /** + * Returns the Feast value slot index where the converted value is placed in the feature row. 
+ * + * @return the index of {@link ValueEnum#TimestampType}, as a two-field {@link Row} maps to a Feast timestamp value + */ @Override public Integer getIndex() { return TimestampType.getValue(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/ValueEnum.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/ValueEnum.java index 82f648948..45859b881 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/ValueEnum.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/aggregate/feast/handler/ValueEnum.java @@ -37,8 +37,16 @@ public enum ValueEnum { */ TimestampType(7); + /** + * The Feast value slot index associated with this type. + */ private Integer value; + /** + * Creates an enum constant bound to its Feast value slot index. + * + * @param value the Feast value slot index this type maps to + */ ValueEnum(Integer value) { this.value = value; } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/factories/FunctionFactory.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/factories/FunctionFactory.java index fdd09a461..d599fcd62 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/factories/FunctionFactory.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/factories/FunctionFactory.java @@ -63,8 +63,14 @@ */ public class FunctionFactory extends UdfFactory { + /** + * Shared {@link Gson} instance used to parse the JSON stream configuration into objects. + */ private static final Gson GSON = new Gson(); + /** + * Orchestrator that supplies Stencil-backed Protobuf descriptors to the UDFs that need schema awareness. 
+ */ private final StencilClientOrchestrator stencilClientOrchestrator; @@ -79,6 +85,16 @@ public FunctionFactory(StreamTableEnvironment streamTableEnvironment, Configurat stencilClientOrchestrator = new StencilClientOrchestrator(configuration); } + /** + * {@inheritDoc} + * + *

<p>Builds the full set of scalar UDFs provided by Dagger functions, covering DART lookups, + * geospatial helpers, time helpers, JSON manipulation, array helpers and feature builders. UDFs + * that require schema awareness or external state are wired with the + * {@link StencilClientOrchestrator} or DART data store they depend on. + * + * @return a {@code HashSet} containing one instance of every registered scalar UDF + */ @Override public HashSet getScalarUdfs() { HashSet scalarUdfs = new HashSet<>(); @@ -115,6 +131,14 @@ public HashSet getScalarUdfs() { return scalarUdfs; } + /** + * {@inheritDoc} + * + *

<p>Builds the set of table UDFs provided by Dagger functions, namely {@link HistogramBucket} + * and {@link OutlierMad}, each of which can emit multiple rows per input row. + * + * @return a {@code HashSet} containing one instance of every registered table UDF + */ @Override public HashSet getTableUdfs() { HashSet tableUdfs = new HashSet<>(); @@ -123,6 +147,14 @@ public HashSet getTableUdfs() { return tableUdfs; } + /** + * {@inheritDoc} + * + *

<p>Builds the set of aggregate UDFs provided by Dagger functions, such as array collection, + * distinct counting, feature accumulation and percentile aggregation. + * + * @return a {@code HashSet} containing one instance of every registered aggregate UDF + */ @Override public HashSet getAggregateUdfs() { HashSet aggregateUdfs = new HashSet<>(); @@ -134,6 +166,15 @@ public HashSet getAggregateUdfs() { return aggregateUdfs; } + /** + * Builds the DART data store backing the {@code DartGet} and {@code DartContains} UDFs. + * + *

<p>The configured store provider, project id and bucket id are read from the job + * {@link Configuration}. When no explicit provider is configured this falls back to the + * GCS-backed store using the GCS-specific project and bucket defaults. + * + * @return a fully configured {@link DartDataStore} for serving DART lookups + */ private DartDataStore getDartDataSource() { String projectID = getConfiguration().getString(Constants.UDF_DART_PROJECT_ID_KEY, Constants.UDF_DART_PROJECT_ID_DEFAULT); String bucketID = getConfiguration().getString(Constants.UDF_DART_BUCKET_ID_KEY, Constants.UDF_DART_BUCKET_ID_DEFAULT); @@ -148,6 +189,16 @@ private DartDataStore getDartDataSource() { return new DefaultDartDataStore(new DartDataStoreClientProvider(udfStoreProvider, projectID, getConfiguration()), bucketID, getConfiguration()); } + /** + * Extracts the mapping of input table name to its Protobuf message class from the job configuration. + * + *

<p>The {@code INPUT_STREAMS} configuration value is parsed as a JSON array of stream + * definitions, and for each stream the input schema table name is associated with its configured + * Protobuf class name. + * + * @return an ordered {@code LinkedHashMap} keyed by table name with the Protobuf + * class name as value + */ private LinkedHashMap getProtosInInputStreams() { LinkedHashMap protoClassForTable = new LinkedHashMap<>(); String jsonArrayString = getConfiguration().getString(INPUT_STREAMS, ""); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/PythonUdfConfig.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/PythonUdfConfig.java index 61205aaff..ebfc5b7d6 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/PythonUdfConfig.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/PythonUdfConfig.java @@ -11,27 +11,61 @@ * The type Python udf config. */ public class PythonUdfConfig { + /** + * Shared, pre-configured Gson instance used to deserialize the raw Python UDF + * configuration JSON into a {@link PythonUdfConfig} instance. + * + *

Complex map key serialization and pretty printing are enabled so the parser + * can faithfully reconstruct the configured values from their JSON representation. + */ private static final Gson GSON = new GsonBuilder() .enableComplexMapKeySerialization() .setPrettyPrinting() .create(); + /** + * Comma-separated list of Python source files that contain the UDFs to register, + * bound to the {@code PYTHON_FILES_KEY} JSON property. + */ @SerializedName(Constants.PYTHON_FILES_KEY) private String pythonFiles; + /** + * Path to the {@code requirements.txt} listing third-party Python dependencies, + * bound to the {@code PYTHON_REQUIREMENTS_KEY} JSON property. + */ @SerializedName(Constants.PYTHON_REQUIREMENTS_KEY) @Getter private String pythonRequirements; + /** + * Comma-separated list of archive files (such as packaged virtual environments) + * to extract for the Python workers, bound to the {@code PYTHON_ARCHIVES_KEY} JSON property. + */ @SerializedName(Constants.PYTHON_ARCHIVES_KEY) private String pythonArchives; + /** + * Maximum number of records transferred to Python workers per Arrow batch, bound to + * the {@code PYTHON_FN_EXECUTION_ARROW_BATCH_SIZE_KEY} JSON property; {@code null} + * selects the configured default. + */ @SerializedName(Constants.PYTHON_FN_EXECUTION_ARROW_BATCH_SIZE_KEY) private Integer pythonArrowBatchSize; + /** + * Maximum number of elements processed per bundle by the Python function runner, bound + * to the {@code PYTHON_FN_EXECUTION_BUNDLE_SIZE_KEY} JSON property; {@code null} selects + * the configured default. + */ @SerializedName(Constants.PYTHON_FN_EXECUTION_BUNDLE_SIZE_KEY) private Integer pythonBundleSize; + /** + * Maximum time, in milliseconds, that a bundle may be buffered before dispatch to the + * Python workers, bound to the {@code PYTHON_FN_EXECUTION_BUNDLE_TIME_KEY} JSON property; + * {@code null} selects the configured default. 
+ */ @SerializedName(Constants.PYTHON_FN_EXECUTION_BUNDLE_TIME_KEY) private Long pythonBundleTime; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/PythonUdfManager.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/PythonUdfManager.java index bbd44dae7..0207ea716 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/PythonUdfManager.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/PythonUdfManager.java @@ -15,8 +15,20 @@ */ public class PythonUdfManager { + /** + * Flink table environment into which the discovered Python UDFs are registered as + * temporary SQL functions. + */ private StreamTableEnvironment tableEnvironment; + /** + * Parsed Python UDF configuration describing the files, requirements, archives and + * execution tuning to apply. + */ private PythonUdfConfig pythonUdfConfig; + /** + * Dagger configuration used to resolve the appropriate file source (local, GCS, OSS or + * COS) for each configured Python file. + */ private final Configuration configuration; /** @@ -52,6 +64,14 @@ public void registerPythonFunctions() throws IOException { } } + /** + * Applies the Python execution settings from the {@link PythonUdfConfig} onto the + * underlying Flink table environment configuration. + * + *

Optional requirements and archives are only set when present, while the Python + * files, Arrow batch size, bundle size and bundle time are always configured using the + * values (or defaults) resolved from the config. + */ private void registerPythonConfig() { if (pythonUdfConfig.getPythonRequirements() != null) { tableEnvironment.getConfig().getConfiguration().setString("python.requirements", pythonUdfConfig.getPythonRequirements()); @@ -65,12 +85,30 @@ private void registerPythonConfig() { tableEnvironment.getConfig().getConfiguration().setLong("python.fn-execution.bundle.time", pythonUdfConfig.getPythonBundleTime()); } + /** + * Executes each of the supplied SQL statements against the table environment. + * + * @param sqlQueries the SQL statements to run, typically temporary function + * registrations derived from the discovered Python files + */ private void executeSql(List sqlQueries) { for (String query : sqlQueries) { tableEnvironment.executeSql(query); } } + /** + * Builds the {@code CREATE TEMPORARY FUNCTION} SQL statements that register each + * discovered Python file as a Flink SQL function. + * + *

For every entry the {@code .py} suffix is stripped and path separators are + * converted to dots to form the fully-qualified Python callable; the derived function + * name (the last path segment, upper-cased) is bound to that callable using the + * {@code PYTHON} language. + * + * @param fileNames the Python file names discovered for a configured source + * @return one SQL registration statement per supplied file name + */ private List createQuery(List fileNames) { List sqlQueries = new ArrayList<>(); for (String fileName : fileNames) { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/FileSourceFactory.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/FileSourceFactory.java index eea9f25e0..873d70a8c 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/FileSourceFactory.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/FileSourceFactory.java @@ -37,6 +37,16 @@ public static FileSource getFileSource(String pythonFile, Configuration configur } } + /** + * Extracts the upper-cased scheme prefix from a configured Python file location. + * + *

The portion preceding {@code ://} is returned, so {@code gs://bucket/file.py} + * yields {@code GS}; a location without a scheme separator yields the whole string + * upper-cased, which is treated as a local file path. + * + * @param pythonFile the configured Python file location + * @return the upper-cased scheme prefix used to select the matching file source + */ private static String getFileSourcePrefix(String pythonFile) { String[] files = pythonFile.split("://"); return files[0].toUpperCase(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/cos/CosFileClient.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/cos/CosFileClient.java index 6ed4a9d07..2bc2863de 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/cos/CosFileClient.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/cos/CosFileClient.java @@ -11,11 +11,31 @@ import java.util.List; import java.util.stream.Collectors; +/** + * Tencent Cloud Object Storage (COS) client used to download Python UDF artifacts. + * + *

Wraps a {@code COSClient} obtained from {@code CosLibClient}, resolving the bucket + * name and object key from a {@code cosn://} location and reading the object content into + * an in-memory byte array. + */ public class CosFileClient { + /** + * Whether to authenticate using the Tencent Kubernetes Engine (TKE) OIDC provider when + * constructing the underlying COS client. + */ private final boolean enableTkeOidcProvider; + /** + * Tencent Cloud region in which the target COS bucket resides. + */ private final String cosRegion; + /** + * Creates a COS file client bound to the given region and authentication mode. + * + * @param enableTkeOidcProvider whether to authenticate via the TKE OIDC provider + * @param cosRegion the Tencent Cloud region of the target bucket + */ public CosFileClient(boolean enableTkeOidcProvider, String cosRegion) { this.enableTkeOidcProvider = enableTkeOidcProvider; this.cosRegion = cosRegion; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/cos/CosFileSource.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/cos/CosFileSource.java index cbefd93b9..84db27fba 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/cos/CosFileSource.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/cos/CosFileSource.java @@ -4,11 +4,30 @@ import java.io.IOException; +/** + * {@link FileSource} implementation that reads a Python UDF artifact from Tencent Cloud + * Object Storage (COS). + * + *

Delegates the actual download to a lazily-created {@link CosFileClient}, allowing a + * pre-built client to be injected for testing. + */ public class CosFileSource implements FileSource { + /** + * COS client used to fetch the object; created lazily on first use unless injected. + */ private CosFileClient cosFileClient; + /** + * The {@code cosn://} location of the Python file to download. + */ private final String pythonFile; + /** + * Tencent Cloud region of the COS bucket holding the file. + */ private final String cosRegion; + /** + * Whether the COS client should authenticate via the TKE OIDC provider. + */ private final boolean enableTkeOidcProvider; /** @@ -35,6 +54,15 @@ public CosFileSource(String pythonFile, CosFileClient cosFileClient, boolean ena this.enableTkeOidcProvider = enableTkeOidcProvider; } + /** + * {@inheritDoc} + * + *

Lazily obtains a {@link CosFileClient} and downloads the configured COS object, + * returning its raw bytes. + * + * @return the file content downloaded from COS + * @throws IOException if the object cannot be read from COS + */ @Override public byte[] getObjectFile() throws IOException { return getCosClient().getFile(pythonFile); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/gcs/GcsClient.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/gcs/GcsClient.java index a3b85873a..ba7ef5513 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/gcs/GcsClient.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/gcs/GcsClient.java @@ -14,6 +14,10 @@ */ public class GcsClient { + /** + * Google Cloud Storage service handle used to fetch blobs; built from the default + * {@code StorageOptions} or injected for testing. + */ private Storage storage; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/gcs/GcsFileSource.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/gcs/GcsFileSource.java index 2eba02fea..f5456e313 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/gcs/GcsFileSource.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/gcs/GcsFileSource.java @@ -8,7 +8,13 @@ */ public class GcsFileSource implements FileSource { + /** + * GCS client used to fetch the object; created lazily on first use unless injected. + */ private GcsClient gcsClient; + /** + * The {@code gs://} location of the Python file to download. + */ private String pythonFile; /** @@ -32,6 +38,14 @@ public GcsFileSource(String pythonFile, GcsClient gcsClient) { this.gcsClient = gcsClient; } + /** + * {@inheritDoc} + * + *

Lazily obtains a {@link GcsClient} and downloads the configured GCS object, + * returning its raw bytes. + * + * @return the file content downloaded from Google Cloud Storage + */ @Override public byte[] getObjectFile() { return getGcsClient().getFile(pythonFile); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/local/LocalFileSource.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/local/LocalFileSource.java index 649eda21c..d2f8ed60b 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/local/LocalFileSource.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/local/LocalFileSource.java @@ -11,6 +11,9 @@ */ public class LocalFileSource implements FileSource { + /** + * Absolute or relative path on the local file system to the Python file to read. + */ private String pythonFile; /** @@ -22,6 +25,14 @@ public LocalFileSource(String pythonFile) { this.pythonFile = pythonFile; } + /** + * {@inheritDoc} + * + *

Reads the configured local file in full and returns its raw bytes. + * + * @return the content of the local Python file + * @throws IOException if the file cannot be read + */ @Override public byte[] getObjectFile() throws IOException { return Files.readAllBytes(Paths.get(pythonFile)); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/oss/OssClient.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/oss/OssClient.java index 9a89f6c4f..0464700c0 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/oss/OssClient.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/oss/OssClient.java @@ -13,7 +13,17 @@ import java.util.List; import java.util.stream.Collectors; +/** + * Alibaba Cloud Object Storage Service (OSS) client used to download Python UDF artifacts. + * + *

Wraps an {@code OSS} client built against a configured endpoint, resolving the bucket + * name and object key from an {@code oss://} location and reading the object content into + * an in-memory byte array. + */ public class OssClient { + /** + * Underlying Alibaba Cloud OSS client used to fetch objects. + */ private final OSS libOssClient; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/oss/OssFileSource.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/oss/OssFileSource.java index 4950dcb03..2705e32bb 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/oss/OssFileSource.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/source/oss/OssFileSource.java @@ -4,9 +4,25 @@ import java.io.IOException; +/** + * {@link FileSource} implementation that reads a Python UDF artifact from Alibaba Cloud + * Object Storage Service (OSS). + * + *

Delegates the actual download to a lazily-created {@link OssClient}, allowing a + * pre-built client to be injected for testing. + */ public class OssFileSource implements FileSource { + /** + * OSS client used to fetch the object; created lazily on first use unless injected. + */ private OssClient ossClient; + /** + * The {@code oss://} location of the Python file to download. + */ private final String pythonFile; + /** + * OSS endpoint used to construct the client when one is not injected. + */ private final String ossEndpoint; /** @@ -31,6 +47,15 @@ public OssFileSource(String pythonFile, OssClient ossClient, String ossEndpoint) this.ossEndpoint = ossEndpoint; } + /** + * {@inheritDoc} + * + *

Lazily obtains an {@link OssClient} and downloads the configured OSS object, + * returning its raw bytes. + * + * @return the file content downloaded from OSS + * @throws IOException if the object cannot be read from OSS + */ @Override public byte[] getObjectFile() throws IOException { return getOssClient().getFile(pythonFile); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/FileTypeFactory.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/FileTypeFactory.java index e3b73c693..760b57c24 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/FileTypeFactory.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/FileTypeFactory.java @@ -28,6 +28,15 @@ public static FileType getFileType(String pythonFile, Configuration configuratio } } + /** + * Determines the file format of a Python file from its extension. + * + *

The substring after the final {@code .} is returned upper-cased, for example + * {@code PY} or {@code ZIP}, and is used to select the matching {@link FileType}. + * + * @param pythonFile the configured Python file location + * @return the upper-cased file extension + */ private static String getFileTypeFormat(String pythonFile) { String[] files = pythonFile.split("\\."); return files[files.length - 1].toUpperCase(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/PythonFileType.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/PythonFileType.java index fd77de76f..04f380db8 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/PythonFileType.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/PythonFileType.java @@ -10,6 +10,9 @@ */ public class PythonFileType implements FileType { + /** + * The location of the single Python ({@code .py}) source file this type represents. + */ private String pythonFile; /** @@ -21,6 +24,15 @@ public PythonFileType(String pythonFile) { this.pythonFile = pythonFile; } + /** + * {@inheritDoc} + * + *

Returns a single-element list containing the bare file name (the segment after the + * last {@code /}) of the configured {@code .py} file. + * + * @return a singleton list holding the Python file name + * @throws PythonFilesEmptyException if no Python file was configured + */ @Override public List getFileNames() { if (pythonFile == null) { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/ZipFileType.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/ZipFileType.java index 259b0fca4..757834a6d 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/ZipFileType.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/python/file/type/ZipFileType.java @@ -14,12 +14,29 @@ */ public class ZipFileType implements FileType { + /** + * Source from which the raw ZIP archive bytes are obtained before extraction. + */ private FileSource fileSource; + /** + * Creates a ZIP file type backed by the given file source. + * + * @param fileSource the source supplying the raw ZIP archive bytes + */ public ZipFileType(FileSource fileSource) { this.fileSource = fileSource; } + /** + * {@inheritDoc} + * + *

Downloads the ZIP archive via the configured {@link FileSource}, iterates over its + * entries and collects the names of those that are Python ({@code .py}) files. + * + * @return the names of the Python files contained in the archive + * @throws IOException if the archive cannot be read from the underlying source + */ @Override public List getFileNames() throws IOException { byte[] object = fileSource.getObjectFile(); @@ -41,6 +58,12 @@ public List getFileNames() throws IOException { return fileNames; } + /** + * Determines whether the given entry name refers to a Python source file. + * + * @param fileName the ZIP entry name to test + * @return {@code true} if the name ends with {@code .py}, otherwise {@code false} + */ private boolean isPythonFile(String fileName) { return fileName.endsWith(".py"); } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ArrayAggregate.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ArrayAggregate.java index 79fe4603e..1eb070fc1 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ArrayAggregate.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ArrayAggregate.java @@ -31,7 +31,14 @@ */ public class ArrayAggregate extends ScalarUdf implements Serializable { + /** + * Processor that evaluates the configured aggregation over the supplied array; created lazily on open. + */ private ArrayProcessor arrayProcessor; + + /** + * Holds the JEXL expression describing the aggregation function (for example {@code sum} or {@code avg}). + */ private AggregationExpression expression; /** @@ -41,6 +48,15 @@ public ArrayAggregate() { this.expression = new AggregationExpression(); } + /** + * {@inheritDoc} + * + *

Lazily instantiates the {@link ArrayProcessor} as an {@code ArrayAggregateProcessor} backed by + * the configured {@link AggregationExpression} the first time Flink opens this function instance. + * + * @param context the Flink function context supplied during initialisation + * @throws Exception if the superclass fails to open + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); @@ -66,6 +82,15 @@ public Object eval(Object[] arrayElements, String operationType, String inputDat return arrayProcessor.process(); } + /** + * {@inheritDoc} + * + *

Wires up the input strategy ({@code ARRAY}, operation {@code STRING}, data-type {@code STRING}) + * and the output strategy (a raw {@code Object}) used to type-check and plan this SQL function. + * + * @param typeFactory the factory used to resolve Flink {@link DataType}s + * @return the {@link TypeInference} describing input and output type strategies + */ @Override public TypeInference getTypeInference(DataTypeFactory typeFactory) { TypeInference build = TypeInference.newBuilder() @@ -76,7 +101,19 @@ public TypeInference getTypeInference(DataTypeFactory typeFactory) { } + /** + * Output {@link TypeStrategy} for {@link ArrayAggregate} describing the SQL result type. + * + *

The aggregation may produce any numeric or object value, so the result is reported as a raw + * {@code Object} data type. + */ private static class ArrayAggregateOutputStrategy implements TypeStrategy { + /** + * Infers the output type as a raw {@code Object} data type. + * + * @param callContext the context describing the current SQL call + * @return an {@link Optional} containing the raw {@code Object} {@link DataType} + */ @Override public Optional inferType(CallContext callContext) { DataTypeFactory dataTypeFactory = callContext.getDataTypeFactory(); @@ -86,14 +123,35 @@ public Optional inferType(CallContext cal } } + /** + * Input {@link InputTypeStrategy} for {@link ArrayAggregate} that fixes the call to three arguments. + * + *

The arguments are the input array, the aggregation operation name and the element data type. + */ private static class ArrayAggregateInputStrategy implements InputTypeStrategy { + /** + * The exact number of arguments accepted by the {@code ArrayAggregate} SQL function. + */ private static final Integer ARRAY_AGGREGATE_UDF_FUNCTION_ARG_COUNT = 3; + /** + * Restricts the SQL function to exactly three arguments. + * + * @return a constant {@link ArgumentCount} of three + */ @Override public ArgumentCount getArgumentCount() { return ConstantArgumentCount.of(ARRAY_AGGREGATE_UDF_FUNCTION_ARG_COUNT); } + /** + * Resolves the argument types to an {@code ARRAY} of raw {@code Object} followed by two + * {@code STRING}s (the operation type and the element data type). + * + * @param callContext the context describing the current SQL call + * @param throwOnFailure whether to raise an error when types cannot be inferred + * @return an {@link Optional} holding the resolved list of argument {@link DataType}s + */ @Override public Optional> inferInputTypes(CallContext callContext, boolean throwOnFailure) { DataTypeFactory dataTypeFactory = callContext.getDataTypeFactory(); @@ -102,6 +160,14 @@ public Optional> inferInputTypes(CallContext callContext, boolean return Optional.of(Arrays.asList(new org.apache.flink.table.types.DataType[]{resolvedArgOneType, DataTypes.STRING(), DataTypes.STRING()})); } + /** + * {@inheritDoc} + * + *

This UDF does not advertise explicit call signatures. + * + * @param definition the Flink function definition + * @return {@code null}, as no fixed signatures are declared + */ @Override public List getExpectedSignatures(FunctionDefinition definition) { return null; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ArrayOperate.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ArrayOperate.java index 7bc8fd78e..b8cacd057 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ArrayOperate.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ArrayOperate.java @@ -32,8 +32,19 @@ * The ArrayOperate udf. */ public class ArrayOperate extends ScalarUdf implements Serializable { + /** + * String representation of the configured operation expression. + */ private String expressionString; + + /** + * Processor that applies the configured element-wise operation to the supplied array; created lazily on open. + */ private ArrayProcessor arrayProcessor; + + /** + * Holds the JEXL expression describing the operation applied to each array element. + */ private OperationExpression expression; /** @@ -43,6 +54,15 @@ public ArrayOperate() { this.expression = new OperationExpression(); } + /** + * {@inheritDoc} + * + *

Lazily instantiates the {@link ArrayProcessor} as an {@code ArrayOperateProcessor} backed by the + * configured {@link OperationExpression} the first time Flink opens this function instance. + * + * @param context the Flink function context supplied during initialisation + * @throws Exception if the superclass fails to open + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); @@ -67,6 +87,15 @@ public Object[] eval(Object[] arrayElements, String operationType, String inputD return getCopyArray(arrayProcessor.process()); } + /** + * {@inheritDoc} + * + *

Wires up the input strategy ({@code ARRAY}, operation {@code STRING}, data-type {@code STRING}) + * and the output strategy (an {@code ARRAY} of raw {@code Object}) for this SQL function. + * + * @param typeFactory the factory used to resolve Flink {@link DataType}s + * @return the {@link TypeInference} describing input and output type strategies + */ @Override public TypeInference getTypeInference(DataTypeFactory typeFactory) { TypeInference build = TypeInference.newBuilder() @@ -76,19 +105,49 @@ public TypeInference getTypeInference(DataTypeFactory typeFactory) { return build; } + /** + * Copies an arbitrary (possibly primitive) array into a new {@code Object[]} via reflection. + * + *

This normalises the processor output, which may be a primitive array, into a boxed object array + * suitable for returning to Flink SQL. + * + * @param originalArray the source array, reflectively accessed element by element + * @return a new {@code Object[]} containing the same elements as {@code originalArray} + */ private Object[] getCopyArray(Object originalArray) { int arrayLen = Array.getLength(originalArray); return IntStream.range(0, arrayLen).mapToObj(i -> Array.get(originalArray, i)).toArray(); } + /** + * Input {@link InputTypeStrategy} for {@link ArrayOperate} that fixes the call to three arguments. + * + *

The arguments are the input array, the operation name and the element data type. + */ private static class ArrayOperateInputTypeStrategy implements InputTypeStrategy { + /** + * The exact number of arguments accepted by the {@code ArrayOperate} SQL function. + */ private static final Integer ARRAY_OPERATE_UDF_FUNCTION_ARG_COUNT = 3; + /** + * Restricts the SQL function to exactly three arguments. + * + * @return a constant {@link ArgumentCount} of three + */ @Override public ArgumentCount getArgumentCount() { return ConstantArgumentCount.of(ARRAY_OPERATE_UDF_FUNCTION_ARG_COUNT); } + /** + * Resolves the argument types to an {@code ARRAY} of raw {@code Object} followed by two + * {@code STRING}s (the operation type and the element data type). + * + * @param callContext the context describing the current SQL call + * @param throwOnFailure whether to raise an error when types cannot be inferred + * @return an {@link Optional} holding the resolved list of argument {@link DataType}s + */ @Override public Optional> inferInputTypes(CallContext callContext, boolean throwOnFailure) { DataTypeFactory dataTypeFactory = callContext.getDataTypeFactory(); @@ -97,6 +156,14 @@ public Optional> inferInputTypes(CallContext callContext, boolean return Optional.of(Arrays.asList(new DataType[]{resolvedArgOneType, DataTypes.STRING(), DataTypes.STRING()})); } + /** + * {@inheritDoc} + * + *

This UDF does not advertise explicit call signatures. + * + * @param definition the Flink function definition + * @return {@code null}, as no fixed signatures are declared + */ @Override public List getExpectedSignatures(FunctionDefinition definition) { return null; @@ -104,7 +171,18 @@ public List getExpectedSignatures(FunctionDefinition definition) { } + /** + * Output {@link TypeStrategy} for {@link ArrayOperate} describing the SQL result type. + * + *

The result is reported as an {@code ARRAY} of raw {@code Object} values. + */ private static class ArrayOperateOutputStrategy implements TypeStrategy { + /** + * Infers the output type as an {@code ARRAY} of raw {@code Object}. + * + * @param callContext the context describing the current SQL call + * @return an {@link Optional} containing the array {@link DataType} + */ @Override public Optional inferType(CallContext callContext) { DataTypeFactory dataTypeFactory = callContext.getDataTypeFactory(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ByteToString.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ByteToString.java index 5d68d577b..343843bd6 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ByteToString.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ByteToString.java @@ -18,6 +18,12 @@ import java.util.List; import java.util.Optional; +/** + * Flink SQL {@link ScalarUdf} that converts a protobuf {@code ByteString} field into a UTF-8 {@code String}. + * + *

It exposes a single {@code eval} entry point and a custom {@link TypeInference} so the engine treats + * the byte field as a {@code STRING} result. + */ public class ByteToString extends ScalarUdf { /** * Given a ByteString, this UDF converts to String. @@ -29,6 +35,15 @@ public String eval(ByteString byteField) { return byteField.toStringUtf8(); } + /** + * {@inheritDoc} + * + *

Declares that the function accepts a single argument (the byte field) and returns a + * {@code STRING}. + * + * @param typeFactory the factory used to resolve Flink {@link DataType}s + * @return the {@link TypeInference} describing input and output type strategies + */ @Override public TypeInference getTypeInference(DataTypeFactory typeFactory) { TypeInference build = TypeInference.newBuilder() @@ -37,25 +52,57 @@ public TypeInference getTypeInference(DataTypeFactory typeFactory) { return build; } + /** + * Output {@link TypeStrategy} for {@link ByteToString} that always reports a {@code STRING} result. + */ private static class ByteStringOutputStrategy implements TypeStrategy { + /** + * Infers the output type as {@code STRING}. + * + * @param callContext the context describing the current SQL call + * @return an {@link Optional} containing the {@code STRING} {@link DataType} + */ @Override public Optional inferType(CallContext callContext) { return Optional.of(DataTypes.STRING()); } } + /** + * Input {@link InputTypeStrategy} for {@link ByteToString} that accepts a single argument. + */ private static class ByteStringInputStrategy implements InputTypeStrategy { + /** + * Restricts the SQL function to exactly one argument. + * + * @return a constant {@link ArgumentCount} of one + */ @Override public ArgumentCount getArgumentCount() { return ConstantArgumentCount.of(1); } + /** + * Passes through the single supplied argument data type unchanged. + * + * @param callContext the context describing the current SQL call + * @param throwOnFailure whether to raise an error when types cannot be inferred + * @return an {@link Optional} holding the single argument {@link DataType} + */ @Override public Optional> inferInputTypes(CallContext callContext, boolean throwOnFailure) { DataType dataType = callContext.getArgumentDataTypes().get(0); return Optional.of(Arrays.asList(dataType)); } + /** + * {@inheritDoc} + * + *

This UDF does not advertise explicit call signatures. + * + * @param definition the Flink function definition + * @return {@code null}, as no fixed signatures are declared + */ @Override public List getExpectedSignatures(FunctionDefinition definition) { return null; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/DartContains.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/DartContains.java index f7c41c2a5..7acad8ec3 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/DartContains.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/DartContains.java @@ -12,7 +12,14 @@ * The DartContains udf. */ public class DartContains extends DartScalarUdf { + /** + * Backing store from which Dart set collections are fetched (for example GCS). + */ private final DartDataStore dataStore; + + /** + * In-memory cache of fetched sets keyed by list name, refreshed when its TTL expires. + */ private final Map setCache; /** @@ -81,6 +88,15 @@ public boolean eval(String listName, String field, String regex, Integer refresh return isPresent; } + /** + * Returns the cached {@code SetCache} for the given list, refreshing it from the data store when the + * cache is empty, missing the entry, or the entry has expired or is itself empty. 
+ * + * @param listName the name of the Dart collection to look up + * @param field the field being evaluated (not passed to the data-store lookup) + * @param refreshRateInHours the cache time-to-live, in hours, after which the set is re-fetched + * @return the up-to-date {@code SetCache} for {@code listName} + */ private SetCache getListData(String listName, String field, int refreshRateInHours) { if (setCache.isEmpty() || !setCache.containsKey(listName) || setCache.get(listName).hasExpired(refreshRateInHours) || setCache.get(listName).isEmpty()) { setCache.put(listName, dataStore.getSet(listName, getMeterStatsManager(), getGaugeStatsManager())); @@ -89,6 +105,11 @@ private SetCache getListData(String listName, String field, int refreshRateInHou return setCache.get(listName); } + /** + * Records a cache-hit or cache-miss metric based on whether the value was found. + * + * @param isPresent {@code true} to mark a Dart cache hit, {@code false} to mark a cache miss + */ private void updateMetrics(boolean isPresent) { if (isPresent) { getMeterStatsManager().markEvent(DartAspects.DART_CACHE_HIT); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/DartGet.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/DartGet.java index 8242fcfeb..b7d089e6f 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/DartGet.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/DartGet.java @@ -13,7 +13,14 @@ * The DartGet udf. */ public class DartGet extends DartScalarUdf { + /** + * Backing store from which Dart map collections are fetched (for example GCS). + */ private final DartDataStore dataStore; + + /** + * In-memory cache of fetched maps keyed by collection name, refreshed when its TTL expires. 
+ */ private final Map cache; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Distance.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Distance.java index a053d1446..83c3fbc49 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Distance.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Distance.java @@ -7,7 +7,14 @@ */ public class Distance extends ScalarUdf { + /** + * Mean radius of the Earth in kilometres, used to scale the haversine result into a distance. + */ private static final int RADIUS_OF_EARTH = 6371; + + /** + * Divisor ({@code 180}) used together with {@link Math#PI} to convert degrees into radians. + */ private static final int DEGREE_TO_RADIAN_DIVISOR = 180; /** @@ -33,6 +40,12 @@ public Double eval(Double latitude1, Double longitude1, Double latitude2, Double return RADIUS_OF_EARTH * c; } + /** + * Converts an angle expressed in degrees into radians. + * + * @param value the angle in degrees + * @return the equivalent angle in radians + */ private static Double degreeToRadian(Double value) { return value * Math.PI / DEGREE_TO_RADIAN_DIVISOR; } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ElementAt.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ElementAt.java index 3a9125483..434889243 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ElementAt.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ElementAt.java @@ -27,11 +27,34 @@ * The ElementAt udf. */ public class ElementAt extends ScalarUdf { + /** + * Mapping of table name to its protobuf class name, used to resolve the descriptor for array elements. + */ private LinkedHashMap protos; + + /** + * Orchestrator used to obtain the {@link StencilClient} for protobuf descriptor lookups. 
+ */ private final StencilClientOrchestrator stencilClientOrchestrator; + + /** + * Cached Stencil client resolved on open and reused for descriptor lookups. + */ private StencilClient stencilClient; + + /** + * Maximum number of arguments accepted by the {@code ElementAt} SQL function. + */ private static final int MAX_ARG_COUNT = 5; + + /** + * Minimum number of arguments accepted by the {@code ElementAt} SQL function. + */ private static final int MINIMUM_ARG_COUNT = 2; + + /** + * Argument count used when the function is applied to a single-table (non-join) query. + */ private static final int ARG_COUNT_WHEN_SINGLE_TABLE_QUERY = 4; @@ -46,6 +69,15 @@ public ElementAt(LinkedHashMap protos, StencilClientOrchestrator this.stencilClientOrchestrator = stencilClientOrchestrator; } + /** + * {@inheritDoc} + * + *

Resolves and caches the {@link StencilClient} on first open so protobuf descriptors can be + * looked up while evaluating array elements. + * + * @param context the Flink function context supplied during initialisation + * @throws Exception if the superclass fails to open + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); @@ -163,6 +195,15 @@ public StencilClient getStencilClient() { return stencilClientOrchestrator.getStencilClient(); } + /** + * {@inheritDoc} + * + *

Configures the variable-arity input strategy (two, four or five arguments) and a {@code STRING} + * output type for this SQL function. + * + * @param typeFactory the factory used to resolve Flink {@link DataType}s + * @return the {@link TypeInference} describing input and output type strategies + */ @Override public TypeInference getTypeInference(DataTypeFactory typeFactory) { return TypeInference.newBuilder() @@ -171,18 +212,44 @@ public TypeInference getTypeInference(DataTypeFactory typeFactory) { .build(); } + /** + * Output {@link TypeStrategy} for {@link ElementAt} that always reports a {@code STRING} result. + */ private static class ElementAtOutputTypeStrategy implements TypeStrategy { + /** + * Infers the output type as {@code STRING}. + * + * @param callContext the context describing the current SQL call + * @return an {@link Optional} containing the {@code STRING} {@link DataType} + */ @Override public Optional inferType(CallContext callContext) { return Optional.of(DataTypes.STRING()); } } + /** + * Null-safely converts a value to its {@code String} representation. + * + * @param value the value to stringify; may be {@code null} + * @return the string form of {@code value}, or {@code null} when {@code value} is {@code null} + */ private String getString(Object value) { return value == null ? null : String.valueOf(value); } + /** + * Input {@link InputTypeStrategy} for {@link ElementAt} accepting two, four or five arguments. + * + *

Two arguments select from a plain object array, four arguments target a single table, and five + * arguments additionally specify the table name for joined streams. + */ private static class ElementAtInputTypeStrategy implements InputTypeStrategy { + /** + * Builds an {@link ArgumentCount} that accepts the two-, four- or five-argument forms of the UDF. + * + * @return an {@link ArgumentCount} validating the supported argument counts + */ @Override public ArgumentCount getArgumentCount() { return new ArgumentCount() { @@ -206,6 +273,15 @@ public Optional getMaxCount() { }; } + /** + * Resolves the argument types depending on the call arity: the supplied types for the two-argument + * form, or the array type followed by {@code STRING}/{@code INT} positional types for the four- and + * five-argument forms. + * + * @param callContext the context describing the current SQL call + * @param throwOnFailure whether to raise an error when types cannot be inferred + * @return an {@link Optional} holding the resolved list of argument {@link DataType}s + */ @Override public Optional> inferInputTypes(CallContext callContext, boolean throwOnFailure) { List argumentDataTypes = callContext.getArgumentDataTypes(); @@ -219,6 +295,14 @@ public Optional> inferInputTypes(CallContext callContext, boolean return Optional.of(Arrays.asList(argumentDataTypes.get(0), DataTypes.STRING(), DataTypes.INT(), DataTypes.STRING(), DataTypes.STRING())); } + /** + * {@inheritDoc} + * + *

This UDF does not advertise explicit call signatures. + * + * @param definition the Flink function definition + * @return {@code null}, as no fixed signatures are declared + */ @Override public List getExpectedSignatures(FunctionDefinition definition) { return null; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/EndOfMonth.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/EndOfMonth.java index e44950953..4e431ec22 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/EndOfMonth.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/EndOfMonth.java @@ -11,9 +11,24 @@ */ public class EndOfMonth extends ScalarUdf { + /** + * Hour-of-day ({@code 23}) representing the final hour, used to roll a timestamp to the end of the day. + */ private static final Integer END_OF_DAY_HOUR = 23; + + /** + * Minute and second value ({@code 59}) marking the last minute and second of the day. + */ private static final Integer END_OF_DAY_MINUTE_AND_SECOND = 59; + + /** + * Largest millisecond value ({@code 999}) within a second, used to reach the very end of the day. + */ private static final Integer MAX_MILLISECONDS = 999; + + /** + * Number of milliseconds in one second, used to convert between seconds and milliseconds. 
+ */ private static final Integer SECOND_IN_MILLIS = 1000; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/EndOfWeek.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/EndOfWeek.java index 1a83380dc..7df360286 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/EndOfWeek.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/EndOfWeek.java @@ -11,9 +11,24 @@ */ public class EndOfWeek extends ScalarUdf { + /** + * Hour-of-day ({@code 23}) representing the final hour, used to roll a timestamp to the end of the day. + */ private static final Integer END_OF_DAY_HOUR = 23; + + /** + * Minute and second value ({@code 59}) marking the last minute and second of the day. + */ private static final Integer END_OF_DAY_MINUTE_AND_SECOND = 59; + + /** + * Number of days in a week ({@code 7}), added to advance from the start of the week to its end. + */ private static final Integer DAY_SPAN = 7; + + /** + * Largest millisecond value ({@code 999}) within a second, used to reach the very end of the day. + */ private static final Integer MAX_MILLISECONDS = 999; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ExponentialMovingAverage.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ExponentialMovingAverage.java index 958db2339..232ac5f3d 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ExponentialMovingAverage.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/ExponentialMovingAverage.java @@ -16,6 +16,9 @@ * The ExponentialMovingAverage udf. */ public class ExponentialMovingAverage extends ScalarUdf { + /** + * Number of milliseconds in one minute, used to bucket timestamps into per-minute positions. 
+ */ private static final long MILLI_SECONDS_IN_MINUTE = 60000; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Filters.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Filters.java index b4f6e2bd5..2f9c456ad 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Filters.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Filters.java @@ -21,7 +21,14 @@ */ public class Filters extends ScalarUdf { + /** + * Orchestrator used to obtain the {@link StencilClient} for protobuf descriptor lookups. + */ private StencilClientOrchestrator stencilClientOrchestrator; + + /** + * Stencil client used to resolve protobuf descriptors; supplied directly or via the orchestrator. + */ private StencilClient stencilClient; /** @@ -42,6 +49,14 @@ public Filters(StencilClient stencilClient) { this.stencilClient = stencilClient; } + /** + * {@inheritDoc} + * + *

Resolves the {@link StencilClient} used to look up protobuf descriptors when the UDF is opened. + * + * @param context the Flink function context supplied during initialisation + * @throws Exception if the superclass fails to open + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); @@ -81,6 +96,17 @@ public List eval(@DataTypeHint(inputGroup = InputGroup.ANY) Obje return output; } + /** + * Tests a single {@code DynamicMessage} against all supplied predicates, requiring every predicate + * to pass. + * + *

Evaluation short-circuits on the first failing predicate; the message is accepted only when the + * number of satisfied predicates equals the total number supplied. + * + * @param dynamicMessage the decoded protobuf message to evaluate + * @param predicates the predicates to apply; a message must satisfy all of them + * @return {@code true} if the message satisfies every predicate, otherwise {@code false} + */ @SafeVarargs private final boolean testDynamicMessage(DynamicMessage dynamicMessage, Predicate... predicates) { int counter = 0; @@ -94,6 +120,13 @@ private final boolean testDynamicMessage(DynamicMessage dynamicMessage, Predicat return counter == predicates.length; } + /** + * Resolves the protobuf {@link Descriptors.Descriptor} for the given class name via the Stencil client. + * + * @param protoClassName the fully-qualified protobuf message class name + * @return the resolved descriptor for {@code protoClassName} + * @throws ClassNotFoundException if no descriptor is registered for {@code protoClassName} + */ private Descriptors.Descriptor getDescriptor(String protoClassName) throws ClassNotFoundException { Descriptors.Descriptor descriptor = getStencilClient().get(protoClassName); if (descriptor == null) { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/LinearTrend.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/LinearTrend.java index b850580f6..ffee5618b 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/LinearTrend.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/LinearTrend.java @@ -14,6 +14,9 @@ * The Linear trend udf. */ public class LinearTrend extends ScalarUdf { + /** + * Number of milliseconds in one minute, used to bucket timestamps into per-minute positions. 
+ */ private static final long MILLI_SECONDS_IN_MINUTE = 60000; /** @@ -31,6 +34,17 @@ public double eval(@DataTypeHint(value = "RAW", bridgedTo = ArrayList.class) Arr return calculateLinearTrend(timestamps, values, Timestamp.valueOf(hopStartTime), windowLengthInMinutes); } + /** + * Computes the gradient of the best-fit line over the demand values arranged across the time window. + * + *

The slope is derived as the ratio of the time/value covariance to the time variance. + * + * @param timestampsArray the timestamps associated with each value + * @param valueList the demand values aligned with {@code timestampsArray} + * @param hopStartTime the start time of the hop window + * @param windowLengthInMinutes the length of the window, in minutes + * @return the gradient (slope) of the best-fit line + */ private double calculateLinearTrend(ArrayList timestampsArray, ArrayList valueList, Timestamp hopStartTime, Integer windowLengthInMinutes) { ArrayList hopWindowList = IntStream.range(0, windowLengthInMinutes).mapToObj(i -> (double) i).collect(Collectors.toCollection(ArrayList::new)); ArrayList orderedValueList = getOrderedValueList(hopStartTime, valueList, timestampsArray, windowLengthInMinutes); @@ -40,6 +54,15 @@ private double calculateLinearTrend(ArrayList timestampsArray, ArrayL return (timeValueCovariance / timeVariance); } + /** + * Places each value at its per-minute position within the window, leaving gaps as {@code 0}. + * + * @param hopStartTime the start time of the hop window + * @param valueList the demand values to order + * @param timestampsArray the timestamps associated with each value + * @param windowLengthInMinutes the length of the window, in minutes + * @return a list of length {@code windowLengthInMinutes} with values placed at their time positions + */ private ArrayList getOrderedValueList(Timestamp hopStartTime, ArrayList valueList, ArrayList timestampsArray, int windowLengthInMinutes) { ArrayList orderedValueList = new ArrayList<>(Collections.nCopies(windowLengthInMinutes, 0d)); IntStream.range(0, valueList.size()).forEach(index -> { @@ -51,6 +74,13 @@ private ArrayList getOrderedValueList(Timestamp hopStartTime, ArrayList< return orderedValueList; } + /** + * Computes the minute offset of a value's timestamp relative to the hop window start. 
+ * + * @param valueStartTime the timestamp of the value + * @param hopStartTime the start time of the hop window + * @return the zero-based position, in minutes, of the value within the window + */ private int getPosition(Timestamp valueStartTime, Timestamp hopStartTime) { long hopStartMS = hopStartTime.getTime(); long valueStartMS = valueStartTime.getTime(); @@ -59,24 +89,58 @@ private int getPosition(Timestamp valueStartTime, Timestamp hopStartTime) { return (int) deltaInMinute; } + /** + * Computes the (unnormalised) variance of the supplied series across the window. + * + * @param list the series of values (the time positions) + * @param hopWindowLength the number of positions in the window + * @return the variance term used in the linear-trend calculation + */ private double getVariance(ArrayList list, int hopWindowLength) { return getSumOfAnArray(getSquareArray(list)) - Math.pow(getSumOfAnArray(list), 2) / hopWindowLength; } + /** + * Computes the (unnormalised) covariance between two equal-length series across the window. + * + * @param listOne the first series (the time positions) + * @param listTwo the second series (the ordered values) + * @param hopWindowLength the number of positions in the window + * @return the covariance term used in the linear-trend calculation + */ private double getCovariance(ArrayList listOne, ArrayList listTwo, int hopWindowLength) { return getSumOfAnArray(multiplyListsOfSameLength(listOne, listTwo)) - getSumOfAnArray(listOne) * getSumOfAnArray(listTwo) / hopWindowLength; } + /** + * Multiplies two equal-length lists element by element. 
+ * + * @param listOne the first list of operands + * @param listTwo the second list of operands + * @return a new list whose elements are the pairwise products of the inputs + */ private ArrayList multiplyListsOfSameLength(ArrayList listOne, ArrayList listTwo) { ArrayList arrayAfterMultiplication = new ArrayList<>(); IntStream.range(0, listOne.size()).forEach(index -> arrayAfterMultiplication.add(index, listOne.get(index) * listTwo.get(index))); return arrayAfterMultiplication; } + /** + * Squares each element of the supplied list. + * + * @param array the list of values to square + * @return a new list containing the square of each input element + */ private ArrayList getSquareArray(ArrayList array) { return array.stream().map(element -> element * element).collect(Collectors.toCollection(ArrayList::new)); } + /** + * Sums all elements of the supplied list. + * + * @param array the list of values to sum + * @return the sum of all elements in {@code array} + */ private double getSumOfAnArray(ArrayList array) { return array.stream().mapToDouble(element -> element).sum(); } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/MapGet.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/MapGet.java index d0f27031c..8e08ce882 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/MapGet.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/MapGet.java @@ -41,6 +41,15 @@ public Object eval(Row[] inputMap, Object key) { return requiredRow.map(row -> row.getField(1)).orElse(null); } + /** + * {@inheritDoc} + * + *

Declares the input strategy (a map represented as an {@code ARRAY} of key/value {@code ROW}s plus + * a key argument) and an output strategy that yields the map's value type. + * + * @param typeFactory the factory used to resolve Flink {@link DataType}s + * @return the {@link TypeInference} describing input and output type strategies + */ @Override public TypeInference getTypeInference(DataTypeFactory typeFactory) { return TypeInference.newBuilder() @@ -49,7 +58,16 @@ public TypeInference getTypeInference(DataTypeFactory typeFactory) { .build(); } + /** + * Output {@link TypeStrategy} for {@link MapGet} that reports the map's value type as the result type. + */ private static class MapOutputTypeStrategy implements TypeStrategy { + /** + * Infers the output type as the value (second) field of the map's key/value row. + * + * @param callContext the context describing the current SQL call + * @return an {@link Optional} containing the value {@link DataType} of the map + */ @Override public Optional inferType(CallContext callContext) { CollectionDataType firstArgumentDataType = (CollectionDataType) callContext.getArgumentDataTypes().get(0); @@ -59,12 +77,28 @@ public Optional inferType(CallContext callContext) { } } + /** + * Input {@link InputTypeStrategy} for {@link MapGet} accepting the map and the lookup key. + */ private static class MapGetInputTypeStrategy implements InputTypeStrategy { + /** + * Restricts the SQL function to exactly two arguments. + * + * @return a constant {@link ArgumentCount} of two + */ @Override public ArgumentCount getArgumentCount() { return ConstantArgumentCount.of(2); } + /** + * Resolves the argument types to an {@code ARRAY} of key/value {@code ROW}s (derived from the map) + * followed by the key's data type. 
+ * + * @param callContext the context describing the current SQL call + * @param throwOnFailure whether to raise an error when types cannot be inferred + * @return an {@link Optional} holding the resolved list of argument {@link DataType}s + */ @Override public Optional> inferInputTypes(CallContext callContext, boolean throwOnFailure) { CollectionDataType firstArgumentDataType = (CollectionDataType) callContext.getArgumentDataTypes().get(0); @@ -75,6 +109,14 @@ public Optional> inferInputTypes(CallContext callContext, boolean return Optional.of(Arrays.asList(mapDataType, secondArgumentDataType)); } + /** + * {@inheritDoc} + * + *

This UDF does not advertise explicit call signatures. + * + * @param definition the Flink function definition + * @return {@code null}, as no fixed signatures are declared + */ @Override public List getExpectedSignatures(FunctionDefinition definition) { return null; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/S2AreaInKm2.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/S2AreaInKm2.java index ea6588e32..064dce7b7 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/S2AreaInKm2.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/S2AreaInKm2.java @@ -9,7 +9,14 @@ */ public class S2AreaInKm2 extends ScalarUdf { + /** + * Total surface area of the Earth in square kilometres, used to scale the cell's fractional area. + */ private static final long TOTAL_EARTH_AREA_KM2 = 510072000; + + /** + * Constant factor ({@code 4}) from the sphere-area formula {@code 4 * PI}, used to normalise the area. + */ private static final long FACTOR = 4; /** @@ -26,6 +33,12 @@ public double eval(String s2id) { return (s2Cell.exactArea() * TOTAL_EARTH_AREA_KM2) / (FACTOR * Math.PI); } + /** + * Builds the {@code S2Cell} corresponding to the given S2 cell identifier. + * + * @param id the numeric S2 cell identifier + * @return the {@code S2Cell} for the supplied identifier + */ private S2Cell getS2CellfromId(long id) { return new S2Cell(new S2CellId(id)); } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/SelectFields.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/SelectFields.java index 5a09be319..9c6574b7a 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/SelectFields.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/SelectFields.java @@ -32,8 +32,19 @@ * The SelectFields udf. 
*/ public class SelectFields extends ScalarUdf { + /** + * Orchestrator used to obtain the {@link StencilClient} for protobuf descriptor lookups. + */ private StencilClientOrchestrator stencilClientOrchestrator; + + /** + * Stencil client used to resolve protobuf descriptors; supplied directly or via the orchestrator. + */ private StencilClient stencilClient; + + /** + * Parser used to read the requested field path out of each decoded protobuf message. + */ private MessageParser messageParser; /** @@ -56,6 +67,15 @@ public SelectFields(StencilClient stencilClient) { this.messageParser = new MessageParser(); } + /** + * {@inheritDoc} + * + *

Resolves and caches the {@link StencilClient} on first open so protobuf descriptors can be + * looked up while selecting fields. + * + * @param context the Flink function context supplied during initialisation + * @throws Exception if the superclass fails to open + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); @@ -113,6 +133,13 @@ public Object[] eval(@DataTypeHint(value = "RAW", bridgedTo = List.class) ListDeclares an output strategy producing an {@code ARRAY} of raw {@code Object} values, matching the + * list of selected field values returned by {@code eval}. + * + * @param typeFactory the factory used to resolve Flink {@link DataType}s + * @return the {@link TypeInference} describing the output type strategy + */ @Override public TypeInference getTypeInference(DataTypeFactory typeFactory) { TypeInference build = TypeInference.newBuilder() @@ -143,7 +179,16 @@ public TypeInference getTypeInference(DataTypeFactory typeFactory) { } + /** + * Output {@link TypeStrategy} for {@link SelectFields} that reports an {@code ARRAY} of raw {@code Object}. + */ private static class SelectFieldsOutputStrategy implements TypeStrategy { + /** + * Infers the output type as an {@code ARRAY} of raw {@code Object}. + * + * @param callContext the context describing the current SQL call + * @return an {@link Optional} containing the array {@link DataType} + */ @Override public Optional inferType(CallContext callContext) { DataTypeFactory dataTypeFactory = callContext.getDataTypeFactory(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Split.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Split.java index 0858ba731..48758a7f7 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Split.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/Split.java @@ -8,6 +8,9 @@ * The Split udf. 
*/ public class Split extends ScalarUdf { + /** + * Logger used to record when an input string cannot be split (for example a {@code null} input). + */ private static final Logger LOGGER = LoggerFactory.getLogger(Split.class.getName()); /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/StartOfMonth.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/StartOfMonth.java index 8ff9fd7a0..46bab1080 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/StartOfMonth.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/StartOfMonth.java @@ -11,8 +11,19 @@ */ public class StartOfMonth extends ScalarUdf { + /** + * First hour of the day ({@code 0}), used to roll a timestamp back to the start of the day. + */ private static final Integer FIRST_HOUR_OF_DAY = 0; + + /** + * Number of seconds in a day ({@code 86400}), used when normalising the day-of-week field. + */ private static final Integer DURATION_OF_DAY_IN_SECONDS = 86400; + + /** + * Number of milliseconds in one second, used to convert between seconds and milliseconds. + */ private static final Integer SECOND_IN_MILLIS = 1000; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/StartOfWeek.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/StartOfWeek.java index 7e5136171..a7d09a68e 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/StartOfWeek.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/StartOfWeek.java @@ -11,7 +11,14 @@ */ public class StartOfWeek extends ScalarUdf { + /** + * First hour of the day ({@code 0}), used to roll a timestamp back to the start of the day. + */ private static final Integer FIRST_HOUR_OF_DAY = 0; + + /** + * Number of seconds in a day ({@code 86400}), used when normalising the day-of-week field. 
+ */ private static final Integer DURATION_OF_DAY_IN_SECONDS = 86400; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/TimeInDate.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/TimeInDate.java index e491a83a7..9c1219865 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/TimeInDate.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/TimeInDate.java @@ -10,6 +10,9 @@ * The Time in date udf. */ public class TimeInDate extends ScalarUdf { + /** + * Number of milliseconds in one second, used to convert between seconds and milliseconds. + */ private static final Integer SECOND_IN_MILLIS = 1000; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/TimestampFromUnix.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/TimestampFromUnix.java index 5c186a5a6..23675d0b5 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/TimestampFromUnix.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/TimestampFromUnix.java @@ -10,6 +10,9 @@ */ public class TimestampFromUnix extends ScalarUdf { + /** + * Number of milliseconds in one second, used to convert UNIX seconds into a {@link Timestamp}. 
+ */ private static final int SECONDS_TO_MILISECONDS = 1000; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/DartAspects.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/DartAspects.java index 2e12ec1b0..6dbf163e1 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/DartAspects.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/DartAspects.java @@ -36,19 +36,41 @@ public enum DartAspects implements Aspects { */ DART_GCS_FILE_SIZE("dart_gcs_file_size", Gauge); + /** + * The metric name reported to StatsD for this aspect. + */ private String value; + /** + * The {@link AspectType} that determines how this aspect is reported (e.g. as a gauge or a metric). + */ private AspectType aspectType; + /** + * Instantiates a new Dart aspect. + * + * @param value the metric name reported to StatsD for this aspect + * @param aspectType the type that controls how the aspect is published + */ DartAspects(String value, AspectType aspectType) { this.value = value; this.aspectType = aspectType; } + /** + * Returns the metric name associated with this Dart aspect. + * + * @return the metric name reported to StatsD + */ @Override public String getValue() { return value; } + /** + * Returns the reporting category for this Dart aspect. 
+ * + * @return the {@link AspectType} describing how this aspect is published + */ @Override public AspectType getAspectType() { return aspectType; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/DartScalarUdf.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/DartScalarUdf.java index df0bc89b1..5398fd006 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/DartScalarUdf.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/DartScalarUdf.java @@ -8,12 +8,42 @@ import static com.gotocompany.dagger.common.core.Constants.UDF_TELEMETRY_GROUP_KEY; +/** + * Base class for the "dart" family of Dagger scalar UDFs that enrich rows by looking up values in an + * external object store (GCS, OSS or COS). + * + *

The concrete subclasses {@code DartGet} and {@code DartContains} are registered into Flink SQL + * and, on every {@code eval(...)} call, resolve a key against a JSON "dart" file cached from the + * store. This base class extends {@link ScalarUdf} to layer dart-specific telemetry on top of the + * generic UDF presence gauge: it owns a {@link MeterStatsManager} configured with all + * {@link DartAspects} so the subclasses can mark cache hits/misses and store fetch outcomes. + */ public abstract class DartScalarUdf extends ScalarUdf { + /** + * Per-subtask meter manager used by dart UDFs to publish their {@link DartAspects} telemetry + * (cache hits/misses, store fetch success/failure). + * + *

It is created and registered in {@link #open(FunctionContext)} and read through its + * Lombok-generated getter; the generated setter is provided purely so tests can inject a mock + * without a running Flink runtime. + */ @Getter @Setter // For testing purpose only private MeterStatsManager meterStatsManager; + /** + * {@inheritDoc} + * + *

After the base {@link ScalarUdf#open(FunctionContext)} sets up the generic UDF gauge, this + * implementation builds a {@link MeterStatsManager} from the function's metric group and + * registers every {@link DartAspects} constant under the UDF telemetry group, keyed by this + * function's {@link #getName() name}. This makes the dart metric meters available before the + * first lookup is evaluated. + * + * @param context the Flink function context, used to obtain the metric group + * @throws Exception if the superclass initialization fails + */ @Override public void open(FunctionContext context) throws Exception { super.open(context); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DartDataStoreClient.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DartDataStoreClient.java index f534ab6db..7cec7cbca 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DartDataStoreClient.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DartDataStoreClient.java @@ -4,6 +4,14 @@ import com.gotocompany.dagger.functions.exceptions.BucketDoesNotExistException; import com.gotocompany.dagger.functions.exceptions.TagDoesNotExistException; +/** + * Low-level client that downloads raw dart JSON content from a specific object-storage provider. + * + *

Each backend (GCS, OSS, COS) supplies its own implementation, which is selected at runtime by + * {@link DartDataStoreClientProvider}. {@link DefaultDartDataStore} layers parsing and caching on top + * of this client, so a new backend usually only needs to implement this single download method; a + * fully custom store should instead implement {@link DartDataStore} directly. + */ public interface DartDataStoreClient { /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DartDataStoreClientProvider.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DartDataStoreClientProvider.java index 8b6399ac0..f82b8622c 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DartDataStoreClientProvider.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DartDataStoreClientProvider.java @@ -8,20 +8,51 @@ import java.io.Serializable; +/** + * Serializable factory that lazily creates the {@link DartDataStoreClient} for the configured object + * storage provider. + * + *

Provider implementations (for example the GCP {@code Storage} client) are generally not + * {@link Serializable}, which would break Flink's operator distribution if a client instance were + * held in a serialized field. To avoid that, this provider keeps only the lightweight configuration + * needed to build a client and constructs the concrete client on first use, on the task manager + * where it is actually needed. + */ public class DartDataStoreClientProvider implements Serializable { + /** Identifier of the backend to use, one of the {@code UDF_STORE_PROVIDER_*} constants. */ private final String udfStoreProvider; + /** Cloud project id passed to backends (such as GCS) that require it. */ private final String projectID; + /** Dagger configuration consulted for backend-specific settings (endpoint, region, OIDC). */ private final Configuration configuration; // Do not make this final, if so then the implementation of client should be Serializable + /** Lazily instantiated, cached client; left non-final and unset by the constructor so it stays {@code null} until first use instead of being built eagerly and serialized along with this provider. */ private DartDataStoreClient dartDataStoreClient; + /** + * Creates a provider capturing the settings needed to build a store client later. + * + * @param udfStoreProvider the backend identifier (one of the {@code UDF_STORE_PROVIDER_*} values) + * @param projectID the cloud project id forwarded to backends that require it (e.g. GCS) + * @param configuration the Dagger configuration supplying backend-specific settings + */ public DartDataStoreClientProvider(String udfStoreProvider, String projectID, Configuration configuration) { this.udfStoreProvider = udfStoreProvider; this.projectID = projectID; this.configuration = configuration; } + /** + * Returns the store client for the configured provider, creating and caching it on first call. + * + *

The client is built from configuration the first time it is requested and reused on + * subsequent calls. {@code GCS} uses the project id, {@code OSS} reads its endpoint, and + * {@code COS} reads its region and OIDC-provider flag from the {@link Configuration}. + * + * @return the lazily created {@link DartDataStoreClient} matching the configured provider + * @throws IllegalArgumentException if the configured provider is not recognized + */ public DartDataStoreClient getDartDataStoreClient() { // In a distributed system, we don't intend the client to be serialized and most of the implementations like // GCP Storage implementation doesn't implement java.io.Serializable interface and you may see the below error diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DefaultDartDataStore.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DefaultDartDataStore.java index 2af965108..8ef5ec5c3 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DefaultDartDataStore.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/DefaultDartDataStore.java @@ -27,11 +27,26 @@ */ public class DefaultDartDataStore implements DartDataStore, Serializable { + /** + * The object-storage directory prefix under which {@code dart-get} map payloads are stored. + */ public static final String DART_GET_DIRECTORY = "dart-get/"; + /** + * The object-storage directory prefix under which {@code dart-contains} set payloads are stored. + */ public static final String DART_CONTAINS_DIRECTORY = "dart-contains/"; + /** + * Provider of the backend-specific client used to fetch dart JSON payloads from object storage. + */ private final DartDataStoreClientProvider clientProvider; + /** + * The identifier of the object-storage bucket that holds the dart data. 
+ */ private final String bucketId; + /** + * The Dagger configuration used to resolve dart-related settings. + */ private final Configuration configuration; /** @@ -46,17 +61,50 @@ public DefaultDartDataStore(DartDataStoreClientProvider clientProvider, String b this.configuration = configuration; } + /** + * {@inheritDoc} + * + *

Fetches the {@code dart-contains} payload for the given set name from object storage and + * wraps the resulting values in a {@link SetCache}. + * + * @param setName the name of the dart set to load + * @param meterStatsManager the meter manager used to record fetch successes and failures + * @param gaugeManager the gauge manager used to record payload size and path telemetry + * @return a {@link SetCache} backed by the fetched set of values + */ @Override public SetCache getSet(String setName, MeterStatsManager meterStatsManager, GaugeStatsManager gaugeManager) { return new SetCache(getSetOfObjects(setName, meterStatsManager, gaugeManager)); } + /** + * {@inheritDoc} + * + *

Fetches the {@code dart-get} payload for the given map name from object storage and wraps + * the resulting key-value pairs in a {@link MapCache}. + * + * @param mapName the name of the dart map to load + * @param meterStatsManager the meter manager used to record fetch successes and failures + * @param gaugeManager the gauge manager used to record payload size and path telemetry + * @return a {@link MapCache} backed by the fetched map of values + */ @Override public MapCache getMap(String mapName, MeterStatsManager meterStatsManager, GaugeStatsManager gaugeManager) { Map mapOfObjects = getMapOfObjects(mapName, meterStatsManager, gaugeManager); return new MapCache(mapOfObjects); } + /** + * Fetches and parses the {@code dart-get} JSON payload for the given dart name into a key-value map. + * + *

On a parsing failure the error is recorded via {@link DartAspects#DART_GCS_FETCH_FAILURES} + * and {@code null} is returned. + * + * @param dartName the name of the dart whose map payload should be fetched + * @param meterManager the meter manager used to record fetch failures + * @param gaugeManager the gauge manager used to record path and size telemetry + * @return the parsed {@code Map} of key-value pairs, or {@code null} when the payload cannot be parsed + */ private Map getMapOfObjects(String dartName, MeterStatsManager meterManager, GaugeStatsManager gaugeManager) { String jsonData = clientProvider.getDartDataStoreClient().fetchJsonData( DartGet.class.getSimpleName(), @@ -76,6 +124,17 @@ private Map getMapOfObjects(String dartName, MeterStatsManager m return map; } + /** + * Fetches and parses the {@code dart-contains} JSON payload for the given dart name into a set of values. + * + *

The payload is expected to contain a {@code "data"} array of strings; on any failure the error + * is recorded via {@link DartAspects#DART_GCS_FETCH_FAILURES} and an empty set is returned. + * + * @param dartName the name of the dart whose set payload should be fetched + * @param meterManager the meter manager used to record fetch failures + * @param gaugeManager the gauge manager used to record path and size telemetry + * @return the parsed {@code Set} of values, or an empty set when the payload cannot be parsed + */ private Set getSetOfObjects(String dartName, MeterStatsManager meterManager, GaugeStatsManager gaugeManager) { String jsonData = clientProvider.getDartDataStoreClient().fetchJsonData(DartContains.class.getSimpleName(), gaugeManager, this.bucketId, DART_CONTAINS_DIRECTORY + dartName); ObjectMapper mapper = new ObjectMapper(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/cos/CosDartClient.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/cos/CosDartClient.java index 3890eeefa..56d5678b3 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/cos/CosDartClient.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/cos/CosDartClient.java @@ -14,13 +14,33 @@ import static com.gotocompany.dagger.common.core.Constants.UDF_TELEMETRY_GROUP_KEY; +/** + * {@link DartDataStoreClient} backed by Tencent Cloud Object Storage (COS). + * + *

It downloads dart JSON files from a COS bucket using a {@link COSClient} obtained from the + * shared {@link CosLibClient} singleton. Because COS credentials are short-lived tokens, no + * long-lived client is cached here: a client is created per operation, which is acceptable given that + * darts are typically fetched once at job startup. Each successful download also records the dart + * path and file size through the supplied gauge stats manager. + */ public class CosDartClient implements DartDataStoreClient { + /** Divisor used to convert a byte count into kilobytes for the file-size gauge. */ private static final Double BYTES_TO_KB = 1024.0; + /** Metric group key under which the per-dart file-size gauge is registered. */ private static final String DART_PATH = "dartpath"; + /** Whether to authenticate via the TKE OIDC credential provider. */ private final boolean enableTkeOidcProvider; + /** COS region the bucket resides in. */ private final String cosRegion; + /** + * Creates a COS dart client capturing the credentials/region settings used to build a + * {@link COSClient} on demand. + * + * @param enableTkeOidcProvider whether to authenticate using the TKE OIDC credential provider + * @param cosRegion the COS region of the bucket to read from + */ public CosDartClient(boolean enableTkeOidcProvider, String cosRegion) { this.enableTkeOidcProvider = enableTkeOidcProvider; this.cosRegion = cosRegion; @@ -30,6 +50,20 @@ public CosDartClient(boolean enableTkeOidcProvider, String cosRegion) { // Create client when using its operation. } + /** + * Downloads the dart object from COS and returns its content as a JSON string. + * + *

Obtains a {@link COSClient} for the configured region/credentials, reads the object's bytes + * fully into a string, and records the dart path and file size (in KB) on the gauge stats + * manager. + * + * @param udfName the name of the calling UDF, used as the gauge metric group label + * @param gaugeStatsManager gauge manager used to record the dart path and file size + * @param bucketName the COS bucket to read from + * @param dartName the object key of the dart file within the bucket + * @return the raw JSON content of the dart object + * @throws TagDoesNotExistException if the object content cannot be read from COS + */ public String fetchJsonData(String udfName, GaugeStatsManager gaugeStatsManager, String bucketName, String dartName) { COSClient cosClient = CosLibClient.getInstance().get(enableTkeOidcProvider, cosRegion); COSObject cosObject = cosClient.getObject(bucketName, dartName); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/gcs/GcsDartClient.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/gcs/GcsDartClient.java index 94a7a6748..cd0a99979 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/gcs/GcsDartClient.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/gcs/GcsDartClient.java @@ -19,9 +19,18 @@ */ public class GcsDartClient implements DartDataStoreClient { + /** + * The GCS storage client used to read dart blobs from buckets. + */ private Storage storage; + /** + * The divisor used to convert blob sizes from bytes to kilobytes when reporting file-size telemetry. + */ private static final Double BYTES_TO_KB = 1024.0; + /** + * The gauge group key under which the dart path is registered for file-size telemetry. 
+ */ private static final String DART_PATH = "dartpath"; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/oss/OssDartClient.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/oss/OssDartClient.java index beb311112..ffb82f719 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/oss/OssDartClient.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/store/oss/OssDartClient.java @@ -16,10 +16,25 @@ import static com.gotocompany.dagger.common.core.Constants.UDF_TELEMETRY_GROUP_KEY; +/** + * The type Oss client. + * + *

A {@link DartDataStoreClient} implementation that fetches dart JSON payloads from Alibaba Cloud + * Object Storage Service (OSS) buckets. + */ public class OssDartClient implements DartDataStoreClient { + /** + * The divisor used to convert object sizes from bytes to kilobytes when reporting file-size telemetry. + */ private static final Double BYTES_TO_KB = 1024.0; + /** + * The gauge group key under which the dart path is registered for file-size telemetry. + */ private static final String DART_PATH = "dartpath"; + /** + * The underlying Alibaba Cloud OSS client used to read dart objects. + */ private final OSS libOssClient; /** @@ -33,6 +48,19 @@ public OssDartClient(String ossEndpoint) { } } + /** + * Fetches the dart JSON payload for the given object from the configured OSS bucket. + * + *

Reads the object content as a string and records dart path and file-size telemetry via the + * supplied gauge manager. + * + * @param udfName the simple name of the UDF requesting the data, used as a telemetry group + * @param gaugeStatsManager the gauge manager used to record path and size telemetry + * @param bucketName the name of the OSS bucket to read from + * @param dartName the object key of the dart payload within the bucket + * @return the dart payload contents as a string + * @throws TagDoesNotExistException if the object content cannot be read from OSS + */ public String fetchJsonData(String udfName, GaugeStatsManager gaugeStatsManager, String bucketName, String dartName) { OSSObject ossObject = libOssClient.getObject(bucketName, dartName); String dartJson; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/Cache.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/Cache.java index 94a9dc37e..469f963f2 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/Cache.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/Cache.java @@ -10,6 +10,9 @@ */ public abstract class Cache implements Serializable { + /** + * The timestamp at which this cache entry was populated, used to evaluate expiry. 
+ */ private Date timeOfCaching; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/MapCache.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/MapCache.java index 0927dbb7b..906c3c184 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/MapCache.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/MapCache.java @@ -18,6 +18,9 @@ public class MapCache extends Cache implements Serializable { * The constant NULL_CACHE. */ public static final MapCache NULL_CACHE = new MapCache(new HashMap(), null); + /** + * The backing key-value pairs held by this cache entry. + */ private Map cache; /** @@ -63,6 +66,12 @@ public boolean isEmpty() { return cache.isEmpty(); } + /** + * Compares this map cache with another object for equality based on the cached key-value pairs. + * + * @param o the object to compare with + * @return {@code true} if the other object is a {@code MapCache} with equal cached contents + */ @Override public boolean equals(Object o) { if (this == o) { @@ -75,6 +84,11 @@ public boolean equals(Object o) { return Objects.equals(cache, mapCache.cache); } + /** + * Returns a hash code derived from the cached key-value pairs. 
+ * + * @return the hash code for this map cache + */ @Override public int hashCode() { return Objects.hash(cache); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/SetCache.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/SetCache.java index bca039b30..22c3f6e2c 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/SetCache.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/dart/types/SetCache.java @@ -14,6 +14,9 @@ public class SetCache extends Cache implements Serializable { * The constant NULL_CACHE. */ public static final SetCache NULL_CACHE = new SetCache(new HashSet<>(), null); + /** + * The backing set of values held by this cache entry. + */ private Set cache; /** @@ -25,6 +28,12 @@ public SetCache(Set cache) { this(cache, new Date()); } + /** + * Instantiates a new Set cache with an explicit caching timestamp. + * + * @param cache the set of values to cache + * @param timeOfCaching the time at which the values were cached, used to evaluate expiry + */ private SetCache(Set cache, Date timeOfCaching) { super(timeOfCaching); this.cache = cache; @@ -49,6 +58,12 @@ public boolean isEmpty() { return cache.isEmpty(); } + /** + * Compares this set cache with another object for equality based on the cached values. + * + * @param o the object to compare with + * @return {@code true} if the other object is a {@code SetCache} with equal cached contents + */ @Override public boolean equals(Object o) { if (this == o) { @@ -63,6 +78,11 @@ public boolean equals(Object o) { return cache != null ? cache.equals(setCache.cache) : setCache.cache == null; } + /** + * Returns a hash code derived from the cached values. + * + * @return the hash code for this set cache + */ @Override public int hashCode() { return cache != null ? 
cache.hashCode() : 0; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/MessageReader.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/MessageReader.java index 7b4d3c50e..d628a51aa 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/MessageReader.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/MessageReader.java @@ -15,9 +15,21 @@ * The Message reader. */ public class MessageReader { + /** + * The Flink {@link Row} record from which a nested element is read. + */ private Row message; + /** + * The fully-qualified protobuf class name describing the schema of the message. + */ private String protoClassName; + /** + * The dot-separated path identifying the parent message within the proto schema. + */ private String pathOfMessage; + /** + * The Stencil client used to resolve protobuf descriptors by class name. + */ private StencilClient stencilClient; /** @@ -35,6 +47,12 @@ public MessageReader(Row message, String protoClassName, String pathOfMessage, S this.stencilClient = stencilClient; } + /** + * Resolves the root protobuf descriptor for the configured proto class name via the Stencil client. 
+ * + * @return the root {@code Descriptor} for the configured proto class + * @throws ClassNotFoundException if no descriptor is registered for the configured proto class name + */ private Descriptor getRootDescriptor() throws ClassNotFoundException { Descriptor dsc = stencilClient.get(protoClassName); if (dsc == null) { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/descriptor/CustomDescriptor.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/descriptor/CustomDescriptor.java index b1b1c39f7..4ef263b04 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/descriptor/CustomDescriptor.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/descriptor/CustomDescriptor.java @@ -12,6 +12,9 @@ * The Custom descriptor. */ public class CustomDescriptor { + /** + * The wrapped protobuf {@code Descriptor} whose fields this helper navigates. + */ private Descriptor descriptor; /** @@ -69,6 +72,12 @@ public Optional get(String path) { return nextDescriptor.map(CustomDescriptor::new); } + /** + * Compares this custom descriptor with another object for equality based on the wrapped descriptor. + * + * @param o the object to compare with + * @return {@code true} if the other object is a {@code CustomDescriptor} wrapping an equal descriptor + */ @Override public boolean equals(Object o) { if (this == o) { @@ -81,6 +90,11 @@ public boolean equals(Object o) { return descriptor != null ? descriptor.equals(that.descriptor) : that.descriptor == null; } + /** + * Returns a hash code derived from the wrapped descriptor. + * + * @return the hash code for this custom descriptor + */ @Override public int hashCode() { return descriptor != null ? 
descriptor.hashCode() : 0; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/Element.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/Element.java index 167924cb4..32c24a408 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/Element.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/Element.java @@ -10,8 +10,17 @@ * The abstract class of Element. */ public abstract class Element { + /** + * The protobuf field descriptor identifying this element within its parent message. + */ private Descriptors.FieldDescriptor fieldDescriptor; + /** + * The parent element in the path chain, or {@code null} when this is the first element. + */ private Element parent; + /** + * The Flink {@link Row} that holds the value for this element. + */ private Row row; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/RowElement.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/RowElement.java index 847e81687..063006aef 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/RowElement.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/RowElement.java @@ -23,6 +23,12 @@ class RowElement extends Element { super(parent, row, fieldDescriptor); } + /** + * Creates the next element in the path chain for the given child field of this message element. 
+ * + * @param pathElement the name of the child field to descend into + * @return an {@link Optional} containing the next element, or empty when the field is absent + */ public Optional createNext(String pathElement) { Optional childElement = initialize(this, null, new CustomDescriptor(getFieldDescriptor().getMessageType()), pathElement); return childElement; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/ValueElement.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/ValueElement.java index c45d5d581..adfd1a8d1 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/ValueElement.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/elementAt/row/ValueElement.java @@ -22,6 +22,12 @@ class ValueElement extends Element { super(parent, row, fieldDescriptor); } + /** + * Returns an empty result because a leaf value element has no further path elements to descend into. + * + * @param pathElement the name of the requested child field (ignored for value elements) + * @return an always-empty {@link Optional} + */ @Override public Optional createNext(String pathElement) { return Optional.empty(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/MessageParser.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/MessageParser.java index f6c9c2a4f..76f77555f 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/MessageParser.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/MessageParser.java @@ -13,6 +13,9 @@ */ public class MessageParser implements Serializable { + /** + * The converter used to transform protobuf messages into Flink {@code Row} values. 
+ */ private ProtoToRow protoToRow; /** @@ -42,6 +45,16 @@ public Object read(DynamicMessage dynamicMessage, List keys) { } } + /** + * Converts a single resolved protobuf field value into its Flink-compatible representation. + * + *

Message fields are converted to {@code Row} values, enum fields to their string form (single + * or repeated), and repeated string fields to string arrays; all other values are returned as-is. + * + * @param fieldByName the descriptor of the field being parsed + * @param resultField the raw field value extracted from the message + * @return the converted value suitable for use in a Flink {@code Row} + */ private Object parseSingleRow(Descriptors.FieldDescriptor fieldByName, Object resultField) { if (fieldByName.getJavaType() == Descriptors.FieldDescriptor.JavaType.MESSAGE) { return protoToRow.getRow((DynamicMessage) resultField); @@ -59,6 +72,14 @@ private Object parseSingleRow(Descriptors.FieldDescriptor fieldByName, Object re return resultField; } + /** + * Resolves the protobuf field descriptor for the given key within the supplied parent descriptor. + * + * @param key the field name to look up + * @param parentDescriptor the descriptor of the message containing the field + * @return the matching field descriptor + * @throws LongbowException if no field with the given key exists in the parent message + */ private Descriptors.FieldDescriptor getFieldByName(String key, Descriptors.Descriptor parentDescriptor) { Descriptors.FieldDescriptor fieldByName = parentDescriptor.findFieldByName(key); if (fieldByName == null) { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/ProtoToRow.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/ProtoToRow.java index b58149b2a..a1081ebb8 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/ProtoToRow.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/ProtoToRow.java @@ -84,12 +84,26 @@ public String[] getStringRow(List protos) { .toArray(String[]::new); } + /** + * Converts a list of protobuf map-entry messages into an array of two-field key-value rows. 
+ * + * @param protos the list of map-entry messages to convert + * @return an array of {@code Row} values, one per map entry + */ private Object[] getMapRow(List protos) { ArrayList rows = new ArrayList<>(); protos.forEach(entry -> rows.add(getRowFromMap(entry))); return rows.toArray(); } + /** + * Converts a single protobuf map-entry message into a two-field {@code Row} of key and value. + * + * <p>
Missing key or value fields default to an empty string. + * + * @param protos the map-entry message to convert + * @return a {@code Row} holding the entry key at index 0 and the entry value at index 1 + */ private Row getRowFromMap(DynamicMessage protos) { Row row = new Row(2); Object[] keyValue = protos.getAllFields().values().toArray(); @@ -112,6 +126,12 @@ public String[] getRowForStringList(List listField) { return list; } + /** + * Copies a list of protobuf {@code ByteString} values into a {@code ByteString} array. + * + * @param listField the list of byte-string values to copy + * @return an array containing the same byte-string values in order + */ private ByteString[] getRowForByteString(List listField) { ByteString[] byteStrings = new ByteString[listField.size()]; for (int listIndex = 0; listIndex < listField.size(); listIndex++) { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/LongbowArrayType.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/LongbowArrayType.java index 23949c458..cc0f2cb93 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/LongbowArrayType.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/LongbowArrayType.java @@ -11,16 +11,45 @@ * The enum Data type. */ public enum LongbowArrayType implements Serializable { + /** + * Casts the array elements to {@code int} values for aggregation. + */ INTEGER((Stream stream) -> (stream.mapToInt(Integer.class::cast))), + /** + * Casts the array elements to {@code int} values for aggregation (alias of {@code INTEGER}). + */ INT((Stream stream) -> (stream.mapToInt(Integer.class::cast))), + /** + * Casts the array elements to {@code double} values for aggregation. 
+ */ DOUBLE((Stream stream) -> (stream.mapToDouble(Double.class::cast))), + /** + * Casts the array elements (originally {@code float}) to {@code double} values for aggregation. + */ FLOAT((Stream stream) -> (stream.mapToDouble(Float.class::cast))), + /** + * Casts the array elements to {@code long} values for aggregation. + */ LONG((Stream stream) -> (stream.mapToLong(Long.class::cast))), + /** + * Casts the array elements to {@code long} values for aggregation (alias of {@code LONG}). + */ BIGINT((Stream stream) -> (stream.mapToLong(Long.class::cast))), + /** + * Leaves the array elements unconverted, passing the object stream through unchanged. + */ OTHER((Stream stream) -> (stream)); + /** + * The function that casts a stream of array elements to the primitive-typed stream used for aggregation. + */ private Function, BaseStream> inputCastingFunction; + /** + * Instantiates a new Longbow array type. + * + * @param inputCastingFunction the function that casts an object stream to the appropriate primitive stream + */ LongbowArrayType(Function, BaseStream> inputCastingFunction) { this.inputCastingFunction = inputCastingFunction; } diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/AggregationExpression.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/AggregationExpression.java index b9a7aa85f..8da51fa49 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/AggregationExpression.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/AggregationExpression.java @@ -4,13 +4,29 @@ * The Aggregation expression. */ public class AggregationExpression implements Expression { + /** + * The JEXL expression string that this aggregation builds and exposes for evaluation. 
+ */ private String expressionString; + /** + * {@inheritDoc} + * + * @return the JEXL expression string built for this aggregation + */ @Override public String getExpressionString() { return expressionString; } + /** + * {@inheritDoc} + * + * <p>
Builds an aggregation expression by appending the operation chain for the given operation type + * to the base stream variable. + * + * @param operationType the dot-separated operation chain to apply (for example {@code "sum"}) + */ @Override public void createExpression(String operationType) { this.expressionString = BASE_STRING + getOperationExpression(operationType); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/Expression.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/Expression.java index 354396fc8..b808a602f 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/Expression.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/Expression.java @@ -8,6 +8,9 @@ * The interface Expression. */ public interface Expression extends Serializable { + /** + * The name of the JEXL context variable that holds the input stream operated on by expressions. + */ String BASE_STRING = "stream"; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/OperationExpression.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/OperationExpression.java index 3670c2ae7..907e35d93 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/OperationExpression.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/expression/OperationExpression.java @@ -4,14 +4,33 @@ * The Operation expression. */ public class OperationExpression implements Expression { + /** + * The trailing JEXL fragment that materialises the operated stream back into an array. 
+ */ public static final String CONVERT_TO_ARRAY = ".toArray()"; + /** + * The JEXL expression string that this operation builds and exposes for evaluation. + */ private String expressionString; + /** + * {@inheritDoc} + * + * @return the JEXL expression string built for this operation + */ @Override public String getExpressionString() { return expressionString; } + /** + * {@inheritDoc} + * + * <p>
Builds an operation expression by appending the operation chain for the given operation type + * to the base stream variable and converting the result back into an array. + * + * @param operationType the dot-separated operation chain to apply (for example {@code "distinct"}) + */ @Override public void createExpression(String operationType) { this.expressionString = BASE_STRING + getOperationExpression(operationType) + CONVERT_TO_ARRAY; diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayAggregateProcessor.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayAggregateProcessor.java index 215b15bd0..11a5542ec 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayAggregateProcessor.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayAggregateProcessor.java @@ -35,6 +35,15 @@ public ArrayAggregateProcessor(JexlEngine jexlEngine, JexlContext jexlContext, J super(jexlEngine, jexlContext, jexlScript, expression); } + /** + * {@inheritDoc} + * + * <p>
Executes the configured JEXL aggregation script against the prepared context and unwraps any + * {@code Optional}-typed numeric result into a concrete value. + * + * @return the aggregation result, or {@code 0} when an empty optional numeric result is produced + * @throws ArrayAggregationException if the script execution fails + */ @Override public Object process() { try { @@ -45,6 +54,12 @@ public Object process() { } } + /** + * Unwraps an {@code OptionalDouble}, {@code OptionalInt}, or {@code OptionalLong} result into a concrete value. + * + * @param result the raw script execution result + * @return the unwrapped numeric value defaulting to {@code 0} when the optional is empty, or the original result when it is not optional + */ private Object getValueFromOptionalOutput(Object result) { if (result instanceof OptionalDouble) { return ((OptionalDouble) result).orElse(0); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayOperateProcessor.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayOperateProcessor.java index 46e1a4959..a3b252593 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayOperateProcessor.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayOperateProcessor.java @@ -31,6 +31,15 @@ public ArrayOperateProcessor(JexlEngine jexlEngine, JexlContext jexlContext, Jex super(jexlEngine, jexlContext, jexlScript, expression); } + /** + * {@inheritDoc} + * + * <p>
Executes the configured JEXL operation script against the prepared context and returns the + * resulting array. + * + * @return the result produced by executing the operation script + * @throws ArrayOperateException if the script execution fails + */ @Override public Object process() { try { diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayProcessor.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayProcessor.java index 9ea495c68..7b61165f3 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayProcessor.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/scalar/longbow/array/processors/ArrayProcessor.java @@ -15,9 +15,21 @@ * The Abstract class of Array processor. */ public abstract class ArrayProcessor { + /** + * The JEXL context that holds the input stream variable consumed by the compiled script. + */ private JexlContext jexlContext; + /** + * The compiled JEXL script that is executed to produce the processing result. + */ private JexlScript jexlScript; + /** + * The JEXL engine used to compile expression strings into executable scripts. + */ private JexlEngine jexlEngine; + /** + * The expression that supplies the JEXL string describing the operation or aggregation to run. 
+ */ private Expression expression; /** diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/OutlierMad.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/OutlierMad.java index 7b3dca156..1644da90e 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/OutlierMad.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/OutlierMad.java @@ -21,8 +21,20 @@ */ public class OutlierMad extends TableUdf> { + /** + * Logger used to record failures encountered while computing outliers, allowing the UDF to keep + * emitting rows instead of failing the job. + */ private static final Logger LOGGER = LoggerFactory.getLogger(OutlierMad.class.getName()); + /** + * Number of milliseconds in a single minute, used to convert minute-based window and observation + * lengths into timestamp arithmetic. + */ private static final long MILLI_SECONDS_IN_MINUTE = 60000; + /** + * Constant {@code 100} used to express the outlier ratio within the observation window as a + * percentage. + */ private static final double HUNDRED = 100D; /** @@ -54,6 +66,19 @@ public void eval(@DataTypeHint(value = "RAW", bridgedTo = ArrayList.class) Array } } + /** + * Sorts the points chronologically and marks which of them fall inside the observation period. + * + * <p>
A point is flagged observable when its timestamp lies within the trailing observation window + * that ends at the close of the configured window, so that only recent points are considered when + * deciding whether the window contains outliers. + * + * @param windowStartTime the start time of the window + * @param points the points to order and classify + * @param windowLengthInMinutes the length of the window in minutes + * @param observationPeriodInMinutes the trailing period, in minutes, within which points are observable + * @return a chronologically ordered {@code ArrayList} with each point's observability set + */ private ArrayList getOrderedValues(Timestamp windowStartTime, ArrayList points, int windowLengthInMinutes, Integer observationPeriodInMinutes) { points.sort((p1, p2) -> (int) (p1.getTimestamp().getTime() - p2.getTimestamp().getTime())); ArrayList orderedValues = new ArrayList<>(Collections.nCopies(points.size(), Point.EMPTY_POINT)); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/outlier/mad/Mad.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/outlier/mad/Mad.java index d0b90187d..ff16f3c54 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/outlier/mad/Mad.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/outlier/mad/Mad.java @@ -17,8 +17,17 @@ * The Mad for OutlierMad udf. */ public class Mad { + /** + * Logger used to record failures encountered while computing the median absolute deviation. + */ private static final Logger LOGGER = LoggerFactory.getLogger(Mad.class.getName()); + /** + * The time-series points being analysed for outliers. + */ private List points; + /** + * Permitted deviation, expressed as a multiple of the MAD, beyond which a point is treated as an outlier. 
+ */ private final Integer tolerance; /** @@ -55,6 +64,16 @@ public List getOutliers() { return points.stream().filter(Point::isOutlier).collect(Collectors.toList()); } + /** + * Computes and stores, for every point, its scaled distance from the median together with the + * tolerance-derived upper and lower bounds. + * + * <p>
Points at or below the median use the lower-side MAD while points above the median use the + * upper-side MAD, supporting the double-MAD (asymmetric) variant of the algorithm. + * + * @param doubleMAD a two-element list holding the lower-side MAD at index {@code 0} and the + * upper-side MAD at index {@code 1} + */ private void setDistanceFromMAD(ArrayList doubleMAD) { Double median = getMedian(points.stream().map(Point::getValue).collect(Collectors.toList())); for (Point point : points) { @@ -66,6 +85,17 @@ private void setDistanceFromMAD(ArrayList doubleMAD) { } } + /** + * Computes the double (asymmetric) median absolute deviation around the median of the points. + * + * <p>
Points are split into those at or below the median and those at or above it, and a MAD is + * computed for each side. + * + * @return a two-element {@code ArrayList} containing the lower-side MAD followed by the + * upper-side MAD + * @throws MadZeroException if either side has a MAD of zero, which makes outliers undetectable + * @throws MedianNotFound if the median cannot be computed because there are no values + */ private ArrayList getDoubleMAD() { ArrayList valuesLessThanMedian = new ArrayList<>(); ArrayList valuesGreaterThanMedian = new ArrayList<>(); @@ -86,6 +116,16 @@ private ArrayList getDoubleMAD() { return doubleMad; } + /** + * Computes the median absolute deviation (MAD) of the given points' values. + * + * <p>
The MAD is the median of the absolute distances of each value from the values' median. + * + * @param points the points whose values the MAD is computed from + * @return the median absolute deviation of the supplied values + * @throws MadZeroException if the computed MAD is zero, in which case outliers cannot be detected + * @throws MedianNotFound if a median cannot be computed because the list is empty + */ private static Double getMAD(List points) { Double median = getMedian(points.stream().map(Point::getValue).collect(Collectors.toList())); List absoluteDistancesFromMedian = @@ -101,6 +141,13 @@ private static Double getMAD(List points) { } + /** + * Computes the absolute distance of each value from a reference value. + * + * @param values the values to measure + * @param value the reference value distances are measured from + * @return a list of absolute distances aligned with the input {@code values} + */ private static List getAbsoluteDistance(List values, Double value) { ArrayList absoluteDistances = new ArrayList<>(nCopies(values.size(), 0d)); for (int index = 0; index < values.size(); index++) { @@ -109,6 +156,15 @@ private static List getAbsoluteDistance(List values, Double valu return absoluteDistances; } + /** + * Computes the median of the supplied values, sorting the list in place. + * + * <p>
For an even number of elements the mean of the two central values is returned. + * + * @param values the values to compute the median of; reordered in place by this call + * @return the median value + * @throws MedianNotFound if {@code values} is empty + */ private static Double getMedian(List values) { sort(values); int pointValueSize = values.size(); diff --git a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/outlier/mad/Point.java b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/outlier/mad/Point.java index db7609281..8ac7f4e13 100644 --- a/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/outlier/mad/Point.java +++ b/dagger-functions/src/main/java/com/gotocompany/dagger/functions/udfs/table/outlier/mad/Point.java @@ -6,14 +6,39 @@ * The Point for OutlierMad udf. */ public class Point { + /** + * The timestamp of this data point. + */ private final Timestamp timestamp; + /** + * The observed value of this data point. + */ private final Double value; + /** + * Whether this point falls inside the observation window and should be considered when detecting outliers. + */ private final boolean observable; + /** + * The value's distance from the median scaled by the median absolute deviation. + */ private Double distanceFromMad; + /** + * Whether this point has been classified as an outlier. + */ private boolean isOutlier; + /** + * The upper bound beyond which the value is considered an outlier. + */ private double upperBound; + /** + * The lower bound below which the value is considered an outlier. + */ private double lowerBound; + /** + * A reusable placeholder point with a {@code null} timestamp, a zero value and marked as not + * observable, used to pre-fill collections before the real points are computed. + */ public static final Point EMPTY_POINT = new Point(null, 0d, false); /**