Export Hooks: Symlink Exporter #6636
@@ -0,0 +1,93 @@
local extractor = require("lakefs/catalogexport/table_extractor")
local hive = require("lakefs/catalogexport/hive")
local utils = require("lakefs/catalogexport/internal")
local url = require("net/url")
local pathlib = require("path")
local strings = require("strings")

-- make it a function so that @nopcoder can ask why we bloat the code?
local function parse_storage_uri(uri)
    local parsed_uri = url.parse(uri)
    local storage = parsed_uri.scheme
    local bucket = parsed_uri.host
    local key = parsed_uri.path
    if key ~= nil then
        key = key:sub(2) -- remove the leading slash
    end
    return storage, bucket, key
end
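For illustration (not part of the diff), splitting a hypothetical S3 URI with this helper would look like the following, assuming net/url maps the scheme, host, and path as the code above expects:

    -- Hypothetical example - the URI below is illustrative only:
    local storage, bucket, key = parse_storage_uri("s3://my-bucket/exports/tables")
    -- storage == "s3", bucket == "my-bucket", key == "exports/tables"
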
--[[
### Default Symlink File(s) structure:

${storageNamespace}
  _lakefs/
    exported/
      ${ref}/
        ${commitId}/
          ${tableName}/
            p1=v1/symlink.txt
            p1=v2/symlink.txt
            p1=v3/symlink.txt
            ...
]]
local function gen_storage_uri_prefix(storage_ns, commit_id, action_info)
    local branch_or_tag = utils.ref_from_branch_or_tag(action_info)
    return pathlib.join("/", storage_ns, "_lakefs/exported", branch_or_tag, utils.short_digest(commit_id), "/")
end
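As a hedged illustration of the layout documented above (the namespace, branch name, and digest value below are assumptions, not values from this change):

    -- Hypothetical example:
    -- gen_storage_uri_prefix("s3://bucket/repo-ns", commit_id, action)
    --   -> "s3://bucket/repo-ns/_lakefs/exported/main/<short-digest>/"
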
local function new_std_table_writer(export_base_uri)
    local storage, bucket, key_prefix = parse_storage_uri(export_base_uri)
    return function(suffix_key, data)
        local key = pathlib.join("/", key_prefix, suffix_key)
        print("[STD Writer Put Object] Bucket " .. bucket .. " key: " .. key .. " content: \n" .. data)
    end
end
local function new_s3_table_writer(client, export_base_uri)
    local storage, bucket, key_prefix = parse_storage_uri(export_base_uri)
    return function(suffix_key, data)
        local key = pathlib.join("/", key_prefix, suffix_key)
        client.put_object(bucket, key, data)
        print("S3 Put: bucket: " .. bucket .. " key: " .. key)
    end
end
[Review comment] I assume these are two types of callbacks that we can pass as the writer?

[Reply] Nope, they return the callbacks, which have the same signature:

    writer = exporter.new_s3_table_writer(s3, action, args.override_export_uri)
    exporter.export(lakefs, repo_id, commit_id, table_path, writer, trim_obj_base_path)

[Reply] No more writer callback.
local function export(lakefs_client, repo_id, commit_id, table_src_path, blockstore_write, trim_obj_base_path)
[Review comment] The …

[Reply] Changed it as we discussed!
    local descriptor = extractor.get_table_descriptor(lakefs_client, repo_id, commit_id, table_src_path)
    if descriptor.type == "hive" then
        local base_path = descriptor.path
        local pager = hive.extract_partition_pager(lakefs_client, repo_id, commit_id, base_path,
            descriptor.partition_columns)
        local sha = utils.short_digest(commit_id)
        print(string.format('%s table `lakefs://%s/%s/%s`', descriptor.type, repo_id, sha, descriptor.path))
[Review comment] Debug print?

[Reply] Nope, it's a log that I believe is helpful feedback in the Action logs. Thoughts?

[Reply] Added a debug flag.
        for part_key, entries in pager do
            local symlink_data = ""
            for _, entry in ipairs(entries) do
                symlink_data = symlink_data .. entry.physical_address .. "\n"
            end
            -- create key suffix for symlink file
            local storage_key_suffix = part_key
            if #descriptor.partition_columns == 0 then
                -- unpartitioned table: single symlink file under the table name
                storage_key_suffix = pathlib.join("/", descriptor.name, "symlink.txt")
            else
                if trim_obj_base_path then
                    storage_key_suffix = strings.replace(part_key, base_path .. "/", "", 1) -- remove base_path prefix from partition path
                end
                -- append partition path to suffix
                storage_key_suffix = pathlib.join("/", descriptor.name, storage_key_suffix, "symlink.txt")
            end
            -- write symlink file to blockstore
            blockstore_write(storage_key_suffix, symlink_data)
        end
    else
        error("table " .. descriptor.type .. " in path " .. table_src_path .. " not supported")
    end
end
return {
    export = export,
    new_s3_table_writer = new_s3_table_writer,
    new_std_table_writer = new_std_table_writer,
    gen_storage_uri_prefix = gen_storage_uri_prefix
}
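A minimal sketch of how the module might be wired from a hook script, based on the usage shown in the review thread above; the client constructor, args fields, and table path are illustrative assumptions, not confirmed by this diff:

    -- Sketch only - identifiers marked "assumed" are not from this change:
    local aws = require("aws")  -- assumed hook-runtime module exposing an S3 client
    local exporter = require("lakefs/catalogexport/symlink_exporter")

    local s3 = aws.s3_client(args.key, args.secret, args.region)  -- assumed constructor/args
    local base_uri = exporter.gen_storage_uri_prefix(action.storage_namespace, action.commit_id, action)
    local writer = exporter.new_s3_table_writer(s3, base_uri)
    exporter.export(lakefs, action.repository_id, action.commit_id, "tables/animals", writer, true)
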
[Review comment] Yes, you are bloating the code - please don't take a table and assign each of its fields to a local just to return them. If you are not doing anything with a field, just use the member; I don't need to meet another local and remember what it does.
Also, you can see that the writer of url.parse already returns a struct/table, and the beauty is that the member names describe each field - the current function splits them into local variables, and we lose that.
From the Go code, the path can't be nil, and we need to match it with the separator before we drop it.
I suggest the following code + internal.lua.
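The suggested snippet itself is not shown in this view; a sketch of what the comment describes might look like the following (the exact shape of the suggestion is an assumption):

    -- Sketch only: return url.parse's table directly instead of unpacking it,
    -- and drop the leading separator from path only after matching it.
    local function parse_storage_uri(uri)
        local parsed = url.parse(uri)
        if parsed.path ~= nil and parsed.path:sub(1, 1) == "/" then
            parsed.path = parsed.path:sub(2)
        end
        return parsed -- callers read parsed.scheme, parsed.host, parsed.path
    end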
[Reply] 💯