-
Notifications
You must be signed in to change notification settings - Fork 360
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Table Extractor Hook and _lakefs_tables format (#6589)
- Loading branch information
1 parent
772649d
commit e036ddf
Showing
5 changed files
with
287 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
local pathlib = require("path") | ||
local utils = require("lakefs/catalogexport/internal") | ||
local strings = require("strings") | ||
local DEFAULT_PAGE_SIZE = 30 | ||
|
||
--- Extract the Hive-style partition prefix from a full object path.
-- Walks `partitions` in order and, for each column, looks for a
-- `<column>=<value>/` segment located after the previous column's segment.
-- @param partitions ordered list of partition column names (may be nil or empty)
-- @param path full object path to inspect
-- @return the prefix of `path` up to and including the "/" closing the last
--   partition segment; "" when no partition columns are given; nil when a
--   column segment is missing or its value is empty
local function extract_partitions_path(partitions, path)
    if partitions == nil or #partitions == 0 then
        return ""
    end
    local idx = 1
    local is_partition_prefix = strings.has_prefix(path, partitions[1])
    for part_idx, partition in ipairs(partitions) do
        local col_substr = "/" .. partition .. "="
        -- when the path itself starts with the first partition column there is
        -- no leading "/" to match
        if part_idx == 1 and is_partition_prefix then
            col_substr = partition .. "="
        end
        -- plain-text search (4th argument): column names are literal text, not Lua
        -- patterns, so names containing "-", ".", "%" etc. must not be interpreted
        local _, j = string.find(path, col_substr, idx, true)
        if j == nil then
            return nil
        end
        local separator_idx = string.find(path, "/", j + 1, true)
        -- verify "/" was found and that there is a value between "=" and "/"
        if separator_idx == nil or separator_idx <= (j + 1) then
            return nil
        end
        idx = separator_idx
    end
    return string.sub(path, 1, idx)
end
|
||
--- Hive-format partition iterator: each result is the collection of files
-- found under one partition prefix.
-- Objects are listed under `base_path` via `utils.lakefs_object_pager` and
-- grouped by the key returned from `extract_partitions_path`; a group ends
-- when an object with a different key is encountered. Hidden objects
-- (per `pathlib.is_hidden`) are skipped.
-- @param client lakeFS client handed to the object pager
-- @param repo_id repository identifier
-- @param commit_id commit identifier
-- @param base_path listing prefix
-- @param partition_cols ordered list of partition column names
-- @param page_size optional listing page size (defaults to DEFAULT_PAGE_SIZE)
-- @return an iterator function; each call returns `partition_key, entries`
--   where every entry has `physical_address`, `path`, `size` and `checksum`,
--   and returns nil once the listing is exhausted
-- NOTE(review): a path that does not match `partition_cols` yields a nil key,
-- which a generic `for` loop would treat as end of iteration; confirm callers
-- only list well-formed partitioned paths.
local function extract_partition_pager(client, repo_id, commit_id, base_path, partition_cols, page_size)
    local target_partition = ""
    local pager = utils.lakefs_object_pager(client, repo_id, commit_id, "", base_path, "", page_size or DEFAULT_PAGE_SIZE)
    local page = pager()
    return function()
        if page == nil then
            return nil
        end
        local partition_entries = {}
        while true do
            -- `while` rather than `if`: a fetched page may itself be empty, and
            -- indexing page[1] on it would fail
            while #page == 0 do
                page = pager()
                if page == nil then -- no more records
                    return target_partition, partition_entries
                end
            end
            local entry = page[1]
            if pathlib.is_hidden(entry.path) then
                -- hidden objects never belong to a partition; they must still be
                -- consumed, otherwise the loop would spin on the same entry forever
                table.remove(page, 1)
            else
                local partition_key = extract_partitions_path(partition_cols, entry.path)
                -- first time: if not set, assign current object partition as the target_partition key
                if target_partition == "" then
                    target_partition = partition_key
                end
                -- stop if the current entry does not belong to the target_partition;
                -- the entry stays in the page so the next call picks it up
                if partition_key ~= target_partition then
                    local partition_result = target_partition
                    target_partition = partition_key
                    return partition_result, partition_entries
                end
                table.insert(partition_entries, {
                    physical_address = entry.physical_address,
                    path = entry.path,
                    size = entry.size_bytes,
                    checksum = entry.checksum
                })
                -- remove the entry only once it was recorded in the current partition
                table.remove(page, 1)
            end
        end
    end
end
|
||
return { | ||
extract_partition_pager=extract_partition_pager, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
local DEFAULT_SHORT_DIGEST_LEN = 6

--- Shorten a digest to its leading characters.
-- @param digest digest string to shorten
-- @param len optional number of leading characters to keep
--   (defaults to DEFAULT_SHORT_DIGEST_LEN)
-- @return the first `len` characters of `digest`
local function short_digest(digest, len)
    local keep = len or DEFAULT_SHORT_DIGEST_LEN
    return digest:sub(1, keep)
end
|
||
--- Wrap a paginated lakeFS API call in an iterator.
-- @param api_call function(offset) returning `code, resp`, where `resp`
--   carries `results` plus `pagination.has_more` / `pagination.next_offset`
-- @param after offset handed to the first call
-- @return an iterator function: each call performs one request and returns
--   `resp.results`; it returns nil once a response reported no more pages.
--   Raises an error when the call answers with a non-2xx code.
local function lakefs_paginiated_api(api_call, after)
    local cursor = after
    local exhausted = false

    local function fetch_page()
        if exhausted then
            return nil
        end
        local status, body = api_call(cursor)
        if status < 200 or status >= 300 then
            error("lakeFS: api return non-2xx" .. tostring(status))
        end
        local paging = body.pagination
        exhausted = not paging.has_more
        cursor = paging.next_offset
        return body.results
    end

    return fetch_page
end
|
||
--- Paginate over lakeFS objects.
-- Builds a page iterator on top of `lakefs_paginiated_api` that calls
-- `lakefs_client.list_objects` once per page.
-- @param lakefs_client client exposing `list_objects`
-- @param repo_id repository identifier
-- @param commit_id commit identifier
-- @param after offset for the first listing call
-- @param prefix listing prefix
-- @param delimiter listing delimiter
-- @param page_size optional page size (defaults to 30)
-- @return the iterator produced by `lakefs_paginiated_api`
local function lakefs_object_pager(lakefs_client, repo_id, commit_id, after, prefix, delimiter, page_size)
    local amount = page_size or 30
    local function list_page(next_offset)
        return lakefs_client.list_objects(repo_id, commit_id, next_offset, prefix, delimiter, amount)
    end
    return lakefs_paginiated_api(list_page, after)
end
|
||
--- Resolve the ref value from the action global; used as part of setting the
-- default table name.
-- @param action_info action table carrying `event_type` and, depending on the
--   event, `tag_id` or `branch_id`
-- @return `action_info.tag_id` for tag-creation events, `action_info.branch_id`
--   for branch-creation, post-commit and post-merge events
-- Raises an error for any other event type.
local function ref_from_branch_or_tag(action_info)
    local event = action_info.event_type
    if event == "pre-create-tag" or event == "post-create-tag" then
        return action_info.tag_id
    elseif event == "pre-create-branch" or event == "post-create-branch"
        or event == "post-commit" or event == "post-merge" then
        -- every alternative must be compared against `event`: a bare string
        -- literal is always truthy and would make this branch match any event
        return action_info.branch_id
    end
    -- tostring keeps the message well-formed even when event_type is missing
    error("unsupported event type: " .. tostring(event))
end
|
||
return { | ||
short_digest=short_digest, | ||
ref_from_branch_or_tag=ref_from_branch_or_tag, | ||
lakefs_object_pager=lakefs_object_pager, | ||
lakefs_paginiated_api=lakefs_paginiated_api, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
local pathlib = require("path") | ||
local strings = require("strings") | ||
local yaml = require("encoding/yaml") | ||
local utils = require("lakefs/catalogexport/internal") | ||
|
||
local LAKEFS_TABLES_BASE = "_lakefs_tables/" | ||
|
||
--- Check whether a lakeFS entry is a table spec under the tables base prefix.
-- @param entry listing entry with `path_type` and `path`
-- @param tables_base prefix under which table specs live (e.g. "_lakefs_tables/")
-- @return true when the entry is an object whose path (relative to
--   `tables_base` when it carries that prefix) is not hidden and ends in ".yaml"
local function is_table_obj(entry, tables_base)
    if entry.path_type ~= "object" then
        return false
    end
    local path = entry.path
    if strings.has_prefix(path, tables_base) then
        -- strip the whole prefix: the remainder starts one past its last character
        -- (starting at #tables_base would keep the prefix's final character)
        path = path:sub(#tables_base + 1)
    end
    return not pathlib.is_hidden(path) and strings.has_suffix(path, ".yaml")
end
|
||
--- List all YAML table descriptors under LAKEFS_TABLES_BASE.
-- Pages through the object listing (30 per page) and keeps every entry that
-- `is_table_obj` accepts.
-- @param client lakeFS client handed to the object pager
-- @param repo_id repository identifier
-- @param commit_id commit identifier
-- @return a list of `{ physical_address, path }` tables, one per descriptor
local function list_table_descriptor_entries(client, repo_id, commit_id)
    local amount = 30
    local descriptors = {}
    local next_page = utils.lakefs_object_pager(client, repo_id, commit_id, "", LAKEFS_TABLES_BASE, "", amount)
    for listing in next_page do
        for _, obj in ipairs(listing) do
            if is_table_obj(obj, LAKEFS_TABLES_BASE) then
                descriptors[#descriptors + 1] = {
                    physical_address = obj.physical_address,
                    path = obj.path,
                }
            end
        end
    end
    return descriptors
end
|
||
--- Fetch a table descriptor and return it as a parsed YAML object.
-- @param client lakeFS client exposing `get_object`
-- @param repo_id repository identifier
-- @param commit_id commit identifier
-- @param logical_path path of the descriptor object
-- @return the parsed descriptor; `partition_columns` is set to an empty table
--   when the descriptor does not define it
-- Raises an error when the object fetch does not answer with HTTP 200.
local function get_table_descriptor(client, repo_id, commit_id, logical_path)
    local status, body = client.get_object(repo_id, commit_id, logical_path)
    if status ~= 200 then
        error("could not fetch data file: HTTP " .. tostring(status))
    end
    local parsed = yaml.unmarshal(body)
    if not parsed.partition_columns then
        parsed.partition_columns = {}
    end
    return parsed
end
|
||
return { | ||
list_table_descriptor_entries = list_table_descriptor_entries, | ||
get_table_descriptor = get_table_descriptor, | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters