Skip to content

Commit

Permalink
[improvement](statistics)Add log for estimating hive table row count …
Browse files Browse the repository at this point in the history
…logic. (apache#42921)

Add logging to the Hive table row count estimation logic to help
investigate user issues.
  • Loading branch information
Jibing-Li authored Oct 31, 2024
1 parent 9a907b0 commit dd03546
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ public long getCachedRowCount(long catalogId, long dbId, long tableId) {
if (f.isDone()) {
return f.get().orElse(-1L);
}
LOG.info("Row count for table {}.{}.{} is still processing.", catalogId, dbId, tableId);
} catch (Exception e) {
LOG.warn("Unexpected exception while returning row count", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ public long fetchRowCount() {
long rowCount = getRowCountFromExternalSource();
// Only Hive tables support estimating the row count by listing files.
if (rowCount == -1 && dlaType.equals(DLAType.HIVE)) {
LOG.debug("Will estimate row count from file list.");
LOG.info("Will estimate row count for table {} from file list.", name);
rowCount = getRowCountFromFileList();
}
return rowCount;
Expand Down Expand Up @@ -837,14 +837,16 @@ private long getRowCountFromFileList() {
return -1;
}
if (isView()) {
LOG.info("Table {} is view, return 0.", name);
return 0;
}
HiveMetaStoreCache.HivePartitionValues partitionValues = getAllPartitionValues();

// Get files for all partitions.
int samplePartitionSize = Config.hive_stats_partition_sample_size;
List<HiveMetaStoreCache.FileCacheValue> filesByPartitions = getFilesForPartitions(partitionValues,
samplePartitionSize);
List<HiveMetaStoreCache.FileCacheValue> filesByPartitions =
getFilesForPartitions(partitionValues, samplePartitionSize);
LOG.info("Number of files selected for hive table {} is {}", name, filesByPartitions.size());
long totalSize = 0;
// Calculate the total file size.
for (HiveMetaStoreCache.FileCacheValue files : filesByPartitions) {
Expand All @@ -863,14 +865,20 @@ private long getRowCountFromFileList() {
estimatedRowSize += column.getDataType().getSlotSize();
}
if (estimatedRowSize == 0) {
LOG.warn("Table {} estimated size is 0, return 0.", name);
return 0;
}

int totalPartitionSize = partitionValues == null ? 1 : partitionValues.getIdToPartitionItem().size();
if (samplePartitionSize != 0 && samplePartitionSize < totalPartitionSize) {
LOG.info("Table {} sampled {} of {} partitions, sampled size is {}",
name, samplePartitionSize, totalPartitionSize, totalSize);
totalSize = totalSize * totalPartitionSize / samplePartitionSize;
}
return totalSize / estimatedRowSize;
long rows = totalSize / estimatedRowSize;
LOG.info("Table {} rows {}, total size is {}, estimatedRowSize is {}",
name, rows, totalSize, estimatedRowSize);
return rows;
}

// Get all partition values from cache.
Expand All @@ -888,6 +896,12 @@ private HiveMetaStoreCache.HivePartitionValues getAllPartitionValues() {
// No need to worry that this call will invalidate or refresh the cache,
// because it has enough space to keep partition info of all tables in cache.
partitionValues = cache.getPartitionValues(dbName, name, partitionColumnTypes);
if (partitionValues == null || partitionValues.getPartitionNameToIdMap() == null) {
LOG.warn("Partition values for hive table {} is null", name);
} else {
LOG.info("Partition values size for hive table {} is {}",
name, partitionValues.getPartitionNameToIdMap().size());
}
}
return partitionValues;
}
Expand Down Expand Up @@ -923,13 +937,19 @@ private List<HiveMetaStoreCache.FileCacheValue> getFilesForPartitions(
// Get partitions without using the cache, so that it will not invalidate the cache when executing
// non-query requests such as `show table status`.
hivePartitions = cache.getAllPartitionsWithoutCache(dbName, name, partitionValuesList);
LOG.info("Partition list size for hive partition table {} is {}", name, hivePartitions.size());
} else {
hivePartitions.add(new HivePartition(dbName, name, true,
getRemoteTable().getSd().getInputFormat(),
getRemoteTable().getSd().getLocation(), null, Maps.newHashMap()));
}
// Get files for all partitions.
String bindBrokerName = catalog.bindBrokerName();
if (LOG.isDebugEnabled()) {
for (HivePartition partition : hivePartitions) {
LOG.debug("Chosen partition for table {}. [{}]", name, partition.toString());
}
}
return cache.getFilesByPartitionsWithoutCache(hivePartitions, bindBrokerName);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -604,11 +604,14 @@ public static long getIcebergRowCount(ExternalCatalog catalog, String dbName, St
.getIcebergTable(catalog, dbName, tbName);
Snapshot snapshot = icebergTable.currentSnapshot();
if (snapshot == null) {
LOG.info("Iceberg table {}.{}.{} is empty, return row count 0.", catalog.getName(), dbName, tbName);
// empty table
return 0;
}
Map<String, String> summary = snapshot.summary();
return Long.parseLong(summary.get(TOTAL_RECORDS)) - Long.parseLong(summary.get(TOTAL_POSITION_DELETES));
long rows = Long.parseLong(summary.get(TOTAL_RECORDS)) - Long.parseLong(summary.get(TOTAL_POSITION_DELETES));
LOG.info("Iceberg table {}.{}.{} row count in summary is {}", catalog.getName(), dbName, tbName, rows);
return rows;
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,7 @@ public static long getHiveRowCount(HMSExternalTable table) {
long rows = Long.parseLong(parameters.get(NUM_ROWS));
// Sometimes NUM_ROWS in HMS is 0 even though the actual row count is not. Need to check TOTAL_SIZE if NUM_ROWS is 0.
if (rows != 0) {
LOG.info("Get row count {} for hive table {} in table parameters.", rows, table.getName());
return rows;
}
}
Expand All @@ -639,9 +640,13 @@ public static long getHiveRowCount(HMSExternalTable table) {
estimatedRowSize += column.getDataType().getSlotSize();
}
if (estimatedRowSize == 0) {
LOG.warn("Hive table {} estimated row size is invalid {}", table.getName(), estimatedRowSize);
return -1;
}
return totalSize / estimatedRowSize;
long rows = totalSize / estimatedRowSize;
LOG.info("Get row count {} for hive table {} by total size {} and row size {}",
rows, table.getName(), totalSize, estimatedRowSize);
return rows;
}

/**
Expand Down

0 comments on commit dd03546

Please sign in to comment.