/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.table.action.commit;

import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.index.bucket.BucketIdentifier;
import org.apache.hudi.index.bucket.HoodieBucketIndex;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.WorkloadStat;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.apache.hudi.common.model.WriteOperationType.INSERT_OVERWRITE;
import static org.apache.hudi.common.model.WriteOperationType.INSERT_OVERWRITE_TABLE;

/**
 * Packs incoming records into buckets for the Java engine (one bucket maps to one
 * engine partition). Equivalent to SparkBucketIndexPartitioner.
 * TODO: Reduce duplicate code between the Spark and Java versions.
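 *
 * <p>Illustrative wiring (a sketch; {@code profile}, {@code context}, {@code table},
 * and {@code config} come from hypothetical surrounding driver code):
 * <pre>{@code
 *   JavaPartitioner partitioner = new JavaBucketIndexPartitioner(profile, context, table, config);
 *   // keys handed to getPartition are Pair<HoodieKey, Option<HoodieRecordLocation>>
 *   int partition = partitioner.getPartition(Pair.of(hoodieKey, Option.empty()));
 *   BucketInfo bucketInfo = ((JavaBucketIndexPartitioner) partitioner).getBucketInfo(partition);
 * }</pre>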
 */
public class JavaBucketIndexPartitioner implements JavaPartitioner {

  private final int numBuckets;
  private final String indexKeyField;
  private final int totalPartitionPaths;
  private final List<String> partitionPaths;
  /**
   * Maps each partition path to its partition offset. An engine partition id is
   * partition offset + bucket id, and each offset is a multiple of the bucket number.
   */
  private final Map<String, Integer> partitionPathOffset;
  private final boolean isOverwrite;

  /**
   * File group ids keyed by partition path. Decides which file group an incoming
   * update should go to.
   */
  private final Map<String, Set<String>> updatePartitionPathFileIds;

  /**
   * Bucket number to bucket info mapping.
   */
  private final Map<Integer, BucketInfo> bucketInfoMap;

  private final boolean isNonBlockingConcurrencyControl;

  public JavaBucketIndexPartitioner(WorkloadProfile profile,
                                    HoodieEngineContext context,
                                    HoodieTable table,
                                    HoodieWriteConfig config) {
    if (!(table.getIndex() instanceof HoodieBucketIndex)) {
      throw new HoodieException(
          "Bucket index partitioner should only be used with a bucket index, but the table uses "
              + table.getIndex().getClass().getSimpleName());
    }
    this.numBuckets = ((HoodieBucketIndex) table.getIndex()).getNumBuckets();
    this.indexKeyField = config.getBucketIndexHashField();
    this.totalPartitionPaths = profile.getPartitionPaths().size();
    this.partitionPaths = new ArrayList<>(profile.getPartitionPaths());
    this.partitionPathOffset = new HashMap<>();
    int i = 0;
    // Iterate the copied list (not the profile's set) so offsets agree with the
    // index-based decode in getBucketInfo(int)
    for (String partitionPath : partitionPaths) {
      partitionPathOffset.put(partitionPath, i);
      i += numBuckets;
    }
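    // Example (illustrative): with numBuckets = 4 and partition paths [p0, p1], the
    // offsets are {p0=0, p1=4}; a record in p1 that hashes to bucket 2 is routed to
    // engine partition 4 + 2 = 6.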
    this.updatePartitionPathFileIds = new HashMap<>();
    this.bucketInfoMap = new HashMap<>();
    assignUpdates(profile);
    WriteOperationType operationType = profile.getOperationType();
    this.isOverwrite = INSERT_OVERWRITE.equals(operationType) || INSERT_OVERWRITE_TABLE.equals(operationType);
    this.isNonBlockingConcurrencyControl = config.isNonBlockingConcurrencyControl();

    if (isOverwrite) {
      ValidationUtils.checkArgument(!isNonBlockingConcurrencyControl,
          "Insert overwrite is not supported with non-blocking concurrency control");
    }
  }

  private void assignUpdates(WorkloadProfile profile) {
    // Each update location gets tracked
    Set<Map.Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap()
        .entrySet();
    for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
      Set<String> fileIds = updatePartitionPathFileIds.computeIfAbsent(partitionStat.getKey(), k -> new HashSet<>());
      for (Map.Entry<String, Pair<String, Long>> updateLocEntry :
          partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
        fileIds.add(updateLocEntry.getKey());
      }
    }
  }

  @Override
  public int getNumPartitions() {
    return totalPartitionPaths * numBuckets;
  }

  @Override
  @SuppressWarnings("unchecked")
  public int getPartition(Object key) {
    Pair<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Pair<HoodieKey, Option<HoodieRecordLocation>>) key;
    String partitionPath = keyLocation.getLeft().getPartitionPath();
    Option<HoodieRecordLocation> location = keyLocation.getRight();
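    // A record with a known location is an update: reuse the bucket encoded in its
    // existing fileId. Otherwise hash the record key on the index key field.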
    int bucketId = location.isPresent()
        ? BucketIdentifier.bucketIdFromFileId(location.get().getFileId())
        : BucketIdentifier.getBucketId(keyLocation.getLeft().getRecordKey(), indexKeyField, numBuckets);
    return partitionPathOffset.get(partitionPath) + bucketId;
  }

  public BucketInfo getBucketInfo(int bucketNumber) {
    return bucketInfoMap.computeIfAbsent(bucketNumber, k -> {
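      // Invert the encoding used by getPartition(): the global bucket number is
      // partitionOffset + bucketId, with offsets laid out in partitionPaths order.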
      int bucketId = bucketNumber % numBuckets;
      String partitionPath = partitionPaths.get(bucketNumber / numBuckets);
      return getBucketInfo(bucketId, partitionPath);
    });
  }

  protected BucketInfo getBucketInfo(int bucketId, String partitionPath) {
    String bucketIdStr = BucketIdentifier.bucketIdStr(bucketId);
    // Insert overwrite always generates a new bucket file id
    if (isOverwrite) {
      return new BucketInfo(BucketType.INSERT, BucketIdentifier.newBucketFileIdPrefix(bucketIdStr), partitionPath);
    }
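    // File ids of bucket file groups are prefixed with the zero-padded bucket id, so a
    // prefix match finds the existing file group for this bucket, if any.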
    Option<String> fileIdOption = Option.fromJavaOptional(updatePartitionPathFileIds
        .getOrDefault(partitionPath, Collections.emptySet()).stream()
        .filter(e -> e.startsWith(bucketIdStr))
        .findFirst());
    if (fileIdOption.isPresent()) {
      return new BucketInfo(BucketType.UPDATE, fileIdOption.get(), partitionPath);
    } else {
      // Always write into a log file instead of a base file when using NB-CC
      if (isNonBlockingConcurrencyControl) {
        return new BucketInfo(BucketType.UPDATE, BucketIdentifier.newBucketFileIdForNBCC(bucketIdStr), partitionPath);
      }
      return new BucketInfo(BucketType.INSERT, BucketIdentifier.newBucketFileIdPrefix(bucketIdStr), partitionPath);
    }
  }

  @Override
  public List<String> getSmallFileIds() {
    return Collections.emptyList();
  }
}