-
Notifications
You must be signed in to change notification settings - Fork 2.4k
feat: Addition of virtual _hoodie_commit_completion_time column which is used for Incremental Queries #14037
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
846560d
3260aa4
726b93d
76f093b
a60096f
5ef7a9b
349f4fd
8e60409
320b578
9e8a01e
6690dc9
c0e8ee8
cc57195
0ee7b8f
18a686b
826acce
b4ce002
ca001d5
e8674ec
e29faf2
f70ac82
8573ba8
a3094ae
11cd775
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
import org.apache.hudi.common.table.HoodieTableVersion; | ||
import org.apache.hudi.common.table.HoodieTableMetaClient; | ||
import org.apache.hudi.common.table.read.buffer.PositionBasedFileGroupRecordBuffer; | ||
import org.apache.hudi.common.table.timeline.HoodieInstant; | ||
import org.apache.hudi.common.util.InternalSchemaCache; | ||
import org.apache.hudi.common.util.Option; | ||
import org.apache.hudi.common.util.VisibleForTesting; | ||
|
@@ -55,6 +56,7 @@ | |
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
|
||
import static org.apache.avro.JsonProperties.NULL_VALUE; | ||
import static org.apache.hudi.avro.AvroSchemaUtils.appendFieldsToSchemaDedupNested; | ||
import static org.apache.hudi.avro.AvroSchemaUtils.createNewSchemaFromFieldsWithReference; | ||
import static org.apache.hudi.avro.AvroSchemaUtils.findNestedField; | ||
|
@@ -85,6 +87,9 @@ public class FileGroupReaderSchemaHandler<T> { | |
protected final TypedProperties properties; | ||
private final DeleteContext deleteContext; | ||
private final HoodieTableMetaClient metaClient; | ||
private final boolean shouldAddCompletionTime; | ||
private final Map<String, String> commitTimeToCompletionTimeMap; | ||
private final Schema requestedSchemaWithCompletionTime; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the completion time is required, then the requested schema should be updated to include the completion time. I don't think we need a second instance variable. |
||
|
||
public FileGroupReaderSchemaHandler(HoodieReaderContext<T> readerContext, | ||
Schema tableSchema, | ||
|
@@ -98,10 +103,24 @@ public FileGroupReaderSchemaHandler(HoodieReaderContext<T> readerContext, | |
this.requestedSchema = AvroSchemaCache.intern(requestedSchema); | ||
this.hoodieTableConfig = metaClient.getTableConfig(); | ||
this.deleteContext = new DeleteContext(properties, tableSchema); | ||
this.metaClient = metaClient; | ||
|
||
boolean hasInstantRange = readerContext.getInstantRange().isPresent(); | ||
boolean shouldAddCompletionTimeField = !metaClient.isMetadataTable() | ||
&& metaClient.getTableConfig() != null && metaClient.getTableConfig().getTableVersion() != null | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The table config and version should always be non-null, so we can simplify this. |
||
&& metaClient.getTableConfig().getTableVersion().greaterThanOrEquals(HoodieTableVersion.SIX) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The completion time is only available in version 8 and above, if I remember correctly. |
||
&& hasInstantRange; | ||
|
||
this.shouldAddCompletionTime = shouldAddCompletionTimeField; | ||
this.requestedSchemaWithCompletionTime = shouldAddCompletionTimeField | ||
? addCompletionTimeField(this.requestedSchema) | ||
: this.requestedSchema; | ||
this.commitTimeToCompletionTimeMap = this.shouldAddCompletionTime | ||
? buildCompletionTimeMapping(metaClient) | ||
: Collections.emptyMap(); | ||
this.requiredSchema = AvroSchemaCache.intern(prepareRequiredSchema(this.deleteContext)); | ||
this.internalSchema = pruneInternalSchema(requiredSchema, internalSchemaOpt); | ||
this.internalSchemaOpt = getInternalSchemaOpt(internalSchemaOpt); | ||
this.metaClient = metaClient; | ||
} | ||
|
||
public Schema getTableSchema() { | ||
|
@@ -125,12 +144,63 @@ public Option<InternalSchema> getInternalSchemaOpt() { | |
} | ||
|
||
public Option<UnaryOperator<T>> getOutputConverter() { | ||
if (!AvroSchemaUtils.areSchemasProjectionEquivalent(requiredSchema, requestedSchema)) { | ||
return Option.of(readerContext.getRecordContext().projectRecord(requiredSchema, requestedSchema)); | ||
Schema targetSchema = shouldAddCompletionTime ? requestedSchemaWithCompletionTime : requestedSchema; | ||
UnaryOperator<T> projectionConverter = null; | ||
UnaryOperator<T> completionTimeConverter = null; | ||
boolean schemasEquivalent = AvroSchemaUtils.areSchemasProjectionEquivalent(requiredSchema, targetSchema); | ||
if (!schemasEquivalent) { | ||
projectionConverter = readerContext.getRecordContext().projectRecord(requiredSchema, targetSchema); | ||
} | ||
if (shouldAddCompletionTime) { | ||
completionTimeConverter = getCompletionTimeTransformer(); | ||
} | ||
if (projectionConverter != null && completionTimeConverter != null) { | ||
final UnaryOperator<T> finalProjectionConverter = projectionConverter; | ||
final UnaryOperator<T> finalCompletionTimeConverter = completionTimeConverter; | ||
UnaryOperator<T> composed = t -> finalCompletionTimeConverter.apply(finalProjectionConverter.apply(t)); | ||
return Option.of(composed); | ||
} else if (projectionConverter != null) { | ||
return Option.of(projectionConverter); | ||
} else if (completionTimeConverter != null) { | ||
return Option.of(completionTimeConverter); | ||
} | ||
return Option.empty(); | ||
} | ||
|
||
private UnaryOperator<T> getCompletionTimeTransformer() { | ||
return record -> { | ||
try { | ||
Object commitTimeObj = readerContext.getRecordContext().getValue( | ||
record, | ||
requestedSchemaWithCompletionTime, | ||
HoodieRecord.COMMIT_TIME_METADATA_FIELD | ||
); | ||
if (commitTimeObj == null) { | ||
return record; | ||
} | ||
String commitTime = commitTimeObj.toString(); | ||
String completionTime = commitTimeToCompletionTimeMap.getOrDefault(commitTime, commitTime); | ||
Schema.Field completionTimeField = requestedSchemaWithCompletionTime.getField(HoodieRecord.COMMIT_COMPLETION_TIME_METADATA_FIELD); | ||
if (completionTimeField == null) { | ||
return record; | ||
} | ||
int completionTimePos = completionTimeField.pos(); | ||
Object[] fieldValues = new Object[requestedSchemaWithCompletionTime.getFields().size()]; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a way to simply set the value in the existing object instead of creating a new one? At this point the record should have a null for the completion time, based on my understanding of the code. |
||
for (int i = 0; i < fieldValues.length; i++) { | ||
if (i == completionTimePos) { | ||
fieldValues[i] = completionTime; | ||
} else { | ||
Schema.Field field = requestedSchemaWithCompletionTime.getFields().get(i); | ||
fieldValues[i] = readerContext.getRecordContext().getValue(record, requestedSchemaWithCompletionTime, field.name()); | ||
} | ||
} | ||
return readerContext.getRecordContext().constructEngineRecord(requestedSchemaWithCompletionTime, fieldValues); | ||
} catch (Exception e) { | ||
return record; | ||
} | ||
}; | ||
} | ||
|
||
public DeleteContext getDeleteContext() { | ||
return deleteContext; | ||
} | ||
|
@@ -172,12 +242,24 @@ Schema generateRequiredSchema(DeleteContext deleteContext) { | |
boolean hasInstantRange = readerContext.getInstantRange().isPresent(); | ||
//might need to change this if other queries than mor have mandatory fields | ||
if (!readerContext.getHasLogFiles()) { | ||
List<Schema.Field> addedFields = new ArrayList<>(); | ||
if (hasInstantRange && !findNestedField(requestedSchema, HoodieRecord.COMMIT_TIME_METADATA_FIELD).isPresent()) { | ||
List<Schema.Field> addedFields = new ArrayList<>(); | ||
addedFields.add(getField(this.tableSchema, HoodieRecord.COMMIT_TIME_METADATA_FIELD)); | ||
return appendFieldsToSchemaDedupNested(requestedSchema, addedFields); | ||
} | ||
return requestedSchema; | ||
if (shouldAddCompletionTime && !findNestedField(requestedSchemaWithCompletionTime, HoodieRecord.COMMIT_COMPLETION_TIME_METADATA_FIELD).isPresent()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the expectation is that the completion time is a top-level field, not nested, so we can simplify this. |
||
Schema unionSchema = Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)); | ||
Schema.Field completionTimeField = new Schema.Field( | ||
HoodieRecord.COMMIT_COMPLETION_TIME_METADATA_FIELD, | ||
unionSchema, | ||
"Completion time of the commit", | ||
NULL_VALUE | ||
); | ||
addedFields.add(completionTimeField); | ||
} | ||
if (!addedFields.isEmpty()) { | ||
return appendFieldsToSchemaDedupNested(requestedSchemaWithCompletionTime, addedFields); | ||
} | ||
return requestedSchemaWithCompletionTime; | ||
} | ||
|
||
if (hoodieTableConfig.getRecordMergeMode() == RecordMergeMode.CUSTOM) { | ||
|
@@ -190,16 +272,30 @@ Schema generateRequiredSchema(DeleteContext deleteContext) { | |
for (String field : getMandatoryFieldsForMerging( | ||
hoodieTableConfig, this.properties, this.tableSchema, readerContext.getRecordMerger(), | ||
deleteContext.hasBuiltInDeleteField(), deleteContext.getCustomDeleteMarkerKeyValue(), hasInstantRange)) { | ||
if (!findNestedField(requestedSchema, field).isPresent()) { | ||
if (!findNestedField(requestedSchemaWithCompletionTime, field).isPresent()) { | ||
addedFields.add(getField(this.tableSchema, field)); | ||
} | ||
} | ||
|
||
if (hasInstantRange && !findNestedField(requestedSchemaWithCompletionTime, HoodieRecord.COMMIT_TIME_METADATA_FIELD).isPresent()) { | ||
addedFields.add(getField(this.tableSchema, HoodieRecord.COMMIT_TIME_METADATA_FIELD)); | ||
} | ||
if (shouldAddCompletionTime && !findNestedField(requestedSchemaWithCompletionTime, HoodieRecord.COMMIT_COMPLETION_TIME_METADATA_FIELD).isPresent()) { | ||
Schema unionSchema = Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)); | ||
Schema.Field completionTimeField = new Schema.Field( | ||
HoodieRecord.COMMIT_COMPLETION_TIME_METADATA_FIELD, | ||
unionSchema, | ||
"Completion time of the commit", | ||
NULL_VALUE | ||
); | ||
addedFields.add(completionTimeField); | ||
} | ||
|
||
if (addedFields.isEmpty()) { | ||
return requestedSchema; | ||
return requestedSchemaWithCompletionTime; | ||
} | ||
|
||
return appendFieldsToSchemaDedupNested(requestedSchema, addedFields); | ||
return appendFieldsToSchemaDedupNested(requestedSchemaWithCompletionTime, addedFields); | ||
} | ||
|
||
private static String[] getMandatoryFieldsForMerging(HoodieTableConfig cfg, | ||
|
@@ -308,4 +404,26 @@ private static Schema.Field getField(Schema schema, String fieldName) { | |
} | ||
return foundFieldOpt.get(); | ||
} | ||
|
||
private Map<String, String> buildCompletionTimeMapping(HoodieTableMetaClient metaClient) { | ||
return metaClient.getCommitsTimeline().filterCompletedInstants().getInstants().stream() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This does not account for instants that are no longer in the active timeline. We'll need to use the archived timeline as well, so completion times can still be resolved for archived instants. |
||
.collect(Collectors.toMap( | ||
HoodieInstant::requestedTime, | ||
instant -> instant.getCompletionTime() != null ? instant.getCompletionTime() : instant.requestedTime() | ||
)); | ||
} | ||
|
||
private Schema addCompletionTimeField(Schema schema) { | ||
if (findNestedField(schema, HoodieRecord.COMMIT_COMPLETION_TIME_METADATA_FIELD).isPresent()) { | ||
return schema; | ||
} | ||
Schema unionSchema = Schema.createUnion(Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)); | ||
Schema.Field completionTimeField = new Schema.Field( | ||
HoodieRecord.COMMIT_COMPLETION_TIME_METADATA_FIELD, | ||
unionSchema, | ||
"Completion time of the commit", | ||
NULL_VALUE | ||
); | ||
return appendFieldsToSchemaDedupNested(schema, Collections.singletonList(completionTimeField)); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
The metadata fields are typically persisted to the files. In this case it is just a field we add at query time, so maybe we can come up with a better name. You called this a "virtual" field in the description, so maybe something along those lines?