From 1623fa2ec66fb075ef7845174fd4a4c9ddf78c91 Mon Sep 17 00:00:00 2001 From: Taylor Thurlow Date: Wed, 24 Jun 2020 22:37:33 -0700 Subject: [PATCH 1/7] Complete basic commit-graph header parsing --- src/thicket.cr | 4 ++ src/thicket/commit_graph_file.cr | 77 ++++++++++++++++++++++++++++++++ src/thicket/log.cr | 19 ++++++++ 3 files changed, 100 insertions(+) create mode 100644 src/thicket/commit_graph_file.cr diff --git a/src/thicket.cr b/src/thicket.cr index 36c5122..92c2ba4 100644 --- a/src/thicket.cr +++ b/src/thicket.cr @@ -22,6 +22,10 @@ module Thicket exit end + parser.on("-e", "--experimental", "Use true git graph parsing") do |v| + @@options[:experimental] = v + end + parser.on("-d", "--directory=DIRECTORY", "Path to the project directory") do |d| if d.nil? STDERR.puts "You must provide a project directory." diff --git a/src/thicket/commit_graph_file.cr b/src/thicket/commit_graph_file.cr new file mode 100644 index 0000000..054eb7c --- /dev/null +++ b/src/thicket/commit_graph_file.cr @@ -0,0 +1,77 @@ +# https://github.com/git/git/blob/master/Documentation/technical/commit-graph-format.txt +module Thicket + class CommitGraphFile + getter file_path : String + getter version : UInt8 + getter hash_version : UInt8 + getter num_chunks : UInt8 + getter num_base_commit_graphs : UInt8 + + def initialize(@file_path) + file = File.new(@file_path, "rb") + + # Header data + verify_header_signature(file) + @version = file.read_at(4, 1, &.read_byte).not_nil! + @hash_version = file.read_at(5, 1, &.read_byte).not_nil! + @num_chunks = file.read_at(6, 1, &.read_byte).not_nil! + @num_base_commit_graphs = file.read_at(7, 1, &.read_byte).not_nil! + + # Chunk data + chunk_table = [] of { signature: String, offset_bytes: UInt64 } + current_byte = 8 + loop do + chunk_signature = file.read_at(current_byte, 4, &.read_string(4)) + break if chunk_signature == "\0\0\0\0" + puts "Encountered signature: #{chunk_signature.inspect}" + + chunk_offset_bytes = begin + slice = Bytes.new(8) + file.read_at(current_byte + 4, 8, &.read(slice)) + slice.reverse! + slice.to_unsafe.as(UInt64*).value + end + + chunk_table << { signature: chunk_signature, offset_bytes: chunk_offset_bytes } + + current_byte += 12 + end + + pp chunk_table + + file.close + end + + private def verify_header_signature(file : File) + signature = file.read_at(0, 4, &.read_string(4)) + + if signature != "CGPH" + raise "Found unknown commit graph file header signature: #{signature}" + end + end + +# private def chunk_at(file : File, index : UInt8) : CommitGraphChunk +# +# # each chunk header is 12 bytes +# chunk_header_start : UInt32 = (index.to_u32 + 1) * 12 +# +# id : String = file.read_at(chunk_header_start.to_i32, 4, &.read_string(4)) +# puts id +# # raise "encountered terminating label" if id == 0 +# +# slice = Bytes.new(8) +# file.read_at(chunk_header_start.to_i32 + 4, 8, &.read(slice)) +# file_offset = slice.to_unsafe.as(UInt64*).value +# +# return CommitGraphChunk.new(id, file_offset, file) +# end + end + +# class CommitGraphChunk +# getter id : String +# getter file_offset : UInt64 +# +# def initialize(@id, @file_offset, file) +# end +# end +end diff --git a/src/thicket/log.cr b/src/thicket/log.cr index 6394c8f..b6cf026 100644 --- a/src/thicket/log.cr +++ b/src/thicket/log.cr @@ -1,6 +1,7 @@ require "file_utils" require "./time_measure" +require "./commit_graph_file" module Thicket class Log @@ -10,7 +11,25 @@ module Thicket @count_parsed = 0 end + def process_experimental + FileUtils.cd(git_working_directory) + + if File.exists?("./.git/objects/info/commit-graph") + puts "Detected single commit-graph file, no chain present." + cgf = CommitGraphFile.new("./.git/objects/info/commit-graph") + else + puts "Detected commit-graph chain." + chain = true + end + end + def print + if @options[:experimental] + process_experimental + + return + end + FileUtils.cd(git_working_directory) `#{git_log_command}`.split("\n").each do |l| puts process_git_log_line(l) From 789c8b2b911fc591ce0e5b685db633990cf23aac Mon Sep 17 00:00:00 2001 From: Taylor Thurlow Date: Sat, 27 Jun 2020 22:56:24 -0700 Subject: [PATCH 2/7] Add OID fanout parsing --- src/thicket/commit_graph_file.cr | 81 ++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/src/thicket/commit_graph_file.cr b/src/thicket/commit_graph_file.cr index 054eb7c..2372766 100644 --- a/src/thicket/commit_graph_file.cr +++ b/src/thicket/commit_graph_file.cr @@ -2,11 +2,14 @@ module Thicket class CommitGraphFile getter file_path : String + getter version : UInt8 getter hash_version : UInt8 getter num_chunks : UInt8 getter num_base_commit_graphs : UInt8 + getter num_commits : UInt32 + def initialize(@file_path) file = File.new(@file_path, "rb") @@ -18,12 +21,42 @@ module Thicket @num_base_commit_graphs = file.read_at(7, 1, &.read_byte).not_nil! # Chunk data - chunk_table = [] of { signature: String, offset_bytes: UInt64 } + contents = chunk_table_of_contents(file) + oid_fanout_index = contents.index { |c| c[:signature] == "OIDF" }.not_nil! + oid_fanout_offset = contents[oid_fanout_index][:offset] + next_offset = contents[oid_fanout_index + 1][:offset] + oid_fanout_length = next_offset - oid_fanout_offset + slice = Bytes.new(1024) + file.read_at(oid_fanout_offset.to_i32, oid_fanout_length.to_i32, &.read(slice)) + slice.reverse! + fanout = slice.each_slice(4) + .map { |integer_slice| integer_slice.to_unsafe.as(UInt32*).value } + .to_a + .reverse + + @num_commits = fanout.last + + puts "Total number of commits: #{@num_commits}" + + file.close + end + + private def verify_header_signature(file : File) + signature = file.read_at(0, 4, &.read_string(4)) + + if signature != "CGPH" + raise "Found unknown commit graph file header signature: #{signature}" + end + end + + private def chunk_table_of_contents(file : File) : Array({ signature: String, offset: UInt64 }) + contents = [] of { signature: String, offset: UInt64 } + current_byte = 8 + loop do chunk_signature = file.read_at(current_byte, 4, &.read_string(4)) break if chunk_signature == "\0\0\0\0" - puts "Encountered signature: #{chunk_signature.inspect}" chunk_offset_bytes = begin slice = Bytes.new(8) @@ -32,46 +65,24 @@ module Thicket slice.to_unsafe.as(UInt64*).value end - chunk_table << { signature: chunk_signature, offset_bytes: chunk_offset_bytes } + contents << { signature: chunk_signature, offset: chunk_offset_bytes } current_byte += 12 end - pp chunk_table - - file.close - end + if contents.none? { |c| c[:signature] == "OIDF" } + raise "Unable to find OID Fanout chunk in commit graph file." + end - private def verify_header_signature(file : File) - signature = file.read_at(0, 4, &.read_string(4)) + if contents.none? { |c| c[:signature] == "OIDL" } + raise "Unable to find OID Lookup chunk in commit graph file." + end - if signature != "CGPH" - raise "Found unknown commit graph file header signature: #{signature}" + if contents.none? { |c| c[:signature] == "CDAT" } + raise "Unable to find Commit Data chunk in commit graph file." end - end -# private def chunk_at(file : File, index : UInt8) : CommitGraphChunk -# -# # each chunk header is 12 bytes -# chunk_header_start : UInt32 = (index.to_u32 + 1) * 12 -# -# id : String = file.read_at(chunk_header_start.to_i32, 4, &.read_string(4)) -# puts id -# # raise "encountered terminating label" if id == 0 -# -# slice = Bytes.new(8) -# file.read_at(chunk_header_start.to_i32 + 4, 8, &.read(slice)) -# file_offset = slice.to_unsafe.as(UInt64*).value -# -# return CommitGraphChunk.new(id, file_offset, file) -# end + contents.sort_by { |c| c[:offset] } + end end - -# class CommitGraphChunk -# getter id : String -# getter file_offset : UInt64 -# -# def initialize(@id, @file_offset, file) -# end -# end end From 77d65eda79c54e30339a2f4c7fe3311e74513b67 Mon Sep 17 00:00:00 2001 From: Taylor Thurlow Date: Sat, 27 Jun 2020 23:49:16 -0700 Subject: [PATCH 3/7] Add OID Lookup chunk parsing --- src/thicket/commit_graph_file.cr | 40 ++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/src/thicket/commit_graph_file.cr b/src/thicket/commit_graph_file.cr index 2372766..8182fec 100644 --- a/src/thicket/commit_graph_file.cr +++ b/src/thicket/commit_graph_file.cr @@ -9,6 +9,7 @@ module Thicket getter num_base_commit_graphs : UInt8 getter num_commits : UInt32 + getter commit_oids : Array(String) def initialize(@file_path) file = File.new(@file_path, "rb") @@ -20,12 +21,16 @@ module Thicket @num_chunks = file.read_at(6, 1, &.read_byte).not_nil! @num_base_commit_graphs = file.read_at(7, 1, &.read_byte).not_nil! - # Chunk data + # OID Fanout contents = chunk_table_of_contents(file) + pp contents oid_fanout_index = contents.index { |c| c[:signature] == "OIDF" }.not_nil! oid_fanout_offset = contents[oid_fanout_index][:offset] - next_offset = contents[oid_fanout_index + 1][:offset] - oid_fanout_length = next_offset - oid_fanout_offset + oid_fanout_length = if contents[oid_fanout_index + 1]? + contents[oid_fanout_index + 1][:offset] - oid_fanout_offset + else + file.size - oid_fanout_offset + end slice = Bytes.new(1024) file.read_at(oid_fanout_offset.to_i32, oid_fanout_length.to_i32, &.read(slice)) slice.reverse! @@ -36,11 +41,38 @@ module Thicket @num_commits = fanout.last - puts "Total number of commits: #{@num_commits}" + # OID Lookup + oid_lookup_index = contents.index { |c| c[:signature] == "OIDL" }.not_nil! + oid_lookup_offset = contents[oid_lookup_index][:offset] + oid_lookup_length = if contents[oid_lookup_index + 1]? + contents[oid_lookup_index + 1][:offset] - oid_lookup_offset + else + file.size - oid_lookup_offset + end + slice = Bytes.new(@num_commits * commit_hash_length) + file.read_at(oid_lookup_offset.to_i32, oid_lookup_length.to_i32, &.read(slice)) + slice.reverse! + @commit_oids = [] of String + num_commits.times do |i| + start = i * commit_hash_length + subslice = slice[start, commit_hash_length] + @commit_oids << subslice.to_a.map { |b| sprintf("%02x", b) }.reverse.join + end + @commit_oids.reverse! file.close end + # The length of a full commit hash in bytes. + def commit_hash_length : UInt32 + case @hash_version + when 1 # SHA-1 + 20.to_u32 + else + raise "Unknown hash version identifier: #{@hash_version}" + end + end + private def verify_header_signature(file : File) signature = file.read_at(0, 4, &.read_string(4)) From f33b3e377ea59e1ff0022fe1898f250c6f1e45c8 Mon Sep 17 00:00:00 2001 From: Taylor Thurlow Date: Sun, 28 Jun 2020 00:01:25 -0700 Subject: [PATCH 4/7] Exclude trailer hash length when getting last chunk length --- src/thicket/commit_graph_file.cr | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/thicket/commit_graph_file.cr b/src/thicket/commit_graph_file.cr index 8182fec..ad40aca 100644 --- a/src/thicket/commit_graph_file.cr +++ b/src/thicket/commit_graph_file.cr @@ -29,7 +29,8 @@ module Thicket oid_fanout_length = if contents[oid_fanout_index + 1]? contents[oid_fanout_index + 1][:offset] - oid_fanout_offset else - file.size - oid_fanout_offset + # Exclude trailer hash if necessary + file.size - commit_hash_length - oid_fanout_offset end slice = Bytes.new(1024) file.read_at(oid_fanout_offset.to_i32, oid_fanout_length.to_i32, &.read(slice)) @@ -47,7 +48,8 @@ module Thicket oid_lookup_length = if contents[oid_lookup_index + 1]? contents[oid_lookup_index + 1][:offset] - oid_lookup_offset else - file.size - oid_lookup_offset + # Exclude trailer hash if necessary + file.size - commit_hash_length - oid_lookup_offset end slice = Bytes.new(@num_commits * commit_hash_length) file.read_at(oid_lookup_offset.to_i32, oid_lookup_length.to_i32, &.read(slice)) From 0b3685f1bc58ceceff14067d30c52a736f4c0c83 Mon Sep 17 00:00:00 2001 From: Taylor Thurlow Date: Sun, 28 Jun 2020 11:57:31 -0700 Subject: [PATCH 5/7] Do small refactor into separate chunk parsing methods --- src/thicket/commit_graph_file.cr | 97 ++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 37 deletions(-) diff --git a/src/thicket/commit_graph_file.cr b/src/thicket/commit_graph_file.cr index ad40aca..6414955 100644 --- a/src/thicket/commit_graph_file.cr +++ b/src/thicket/commit_graph_file.cr @@ -8,6 +8,7 @@ module Thicket getter num_chunks : UInt8 getter num_base_commit_graphs : UInt8 + getter oid_fanout : Array(UInt32) getter num_commits : UInt32 getter commit_oids : Array(String) @@ -21,46 +22,16 @@ module Thicket @num_chunks = file.read_at(6, 1, &.read_byte).not_nil! @num_base_commit_graphs = file.read_at(7, 1, &.read_byte).not_nil! - # OID Fanout contents = chunk_table_of_contents(file) pp contents - oid_fanout_index = contents.index { |c| c[:signature] == "OIDF" }.not_nil! - oid_fanout_offset = contents[oid_fanout_index][:offset] - oid_fanout_length = if contents[oid_fanout_index + 1]? - contents[oid_fanout_index + 1][:offset] - oid_fanout_offset - else - # Exclude trailer hash if necessary - file.size - commit_hash_length - oid_fanout_offset - end - slice = Bytes.new(1024) - file.read_at(oid_fanout_offset.to_i32, oid_fanout_length.to_i32, &.read(slice)) - slice.reverse! - fanout = slice.each_slice(4) - .map { |integer_slice| integer_slice.to_unsafe.as(UInt32*).value } - .to_a - .reverse - - @num_commits = fanout.last - # OID Lookup - oid_lookup_index = contents.index { |c| c[:signature] == "OIDL" }.not_nil! - oid_lookup_offset = contents[oid_lookup_index][:offset] - oid_lookup_length = if contents[oid_lookup_index + 1]? - contents[oid_lookup_index + 1][:offset] - oid_lookup_offset - else - # Exclude trailer hash if necessary - file.size - commit_hash_length - oid_lookup_offset - end - slice = Bytes.new(@num_commits * commit_hash_length) - file.read_at(oid_lookup_offset.to_i32, oid_lookup_length.to_i32, &.read(slice)) - slice.reverse! - @commit_oids = [] of String - num_commits.times do |i| - start = i * commit_hash_length - subslice = slice[start, commit_hash_length] - @commit_oids << subslice.to_a.map { |b| sprintf("%02x", b) }.reverse.join - end - @commit_oids.reverse! + @oid_fanout = parse_oid_fanout(file, contents) + @num_commits = @oid_fanout.last + puts "Number of commits: #{@num_commits}" + + @commit_oids = parse_oid_lookup(file, contents) + puts "First commit: #{@commit_oids.first[0..6]}" + puts " Last commit: #{@commit_oids.last[0..6]}" file.close end @@ -118,5 +89,57 @@ module Thicket contents.sort_by { |c| c[:offset] } end + + private def parse_oid_fanout( + file : File, + contents : Array({ signature: String, offset: UInt64 }) + ) : Array(UInt32) + oid_fanout_index = contents.index { |c| c[:signature] == "OIDF" }.not_nil! + oid_fanout_offset = contents[oid_fanout_index][:offset] + + oid_fanout_length = if contents[oid_fanout_index + 1]? + contents[oid_fanout_index + 1][:offset] - oid_fanout_offset + else + # Exclude trailer hash if necessary + file.size - commit_hash_length - oid_fanout_offset + end + + slice = Bytes.new(1024) + file.read_at(oid_fanout_offset.to_i32, oid_fanout_length.to_i32, &.read(slice)) + slice.reverse! + + slice.each_slice(4) + .map { |integer_slice| integer_slice.to_unsafe.as(UInt32*).value } + .to_a + .reverse + end + + private def parse_oid_lookup( + file : File, + contents : Array({ signature: String, offset: UInt64 }) + ) : Array(String) + oid_lookup_index = contents.index { |c| c[:signature] == "OIDL" }.not_nil! + oid_lookup_offset = contents[oid_lookup_index][:offset] + + oid_lookup_length = if contents[oid_lookup_index + 1]? + contents[oid_lookup_index + 1][:offset] - oid_lookup_offset + else + # Exclude trailer hash if necessary + file.size - commit_hash_length - oid_lookup_offset + end + + slice = Bytes.new(@num_commits * commit_hash_length) + file.read_at(oid_lookup_offset.to_i32, oid_lookup_length.to_i32, &.read(slice)) + slice.reverse! + + oids = Array.new(num_commits) do |i| + start = i * commit_hash_length + subslice = slice[start, commit_hash_length] + + subslice.to_a.map { |b| sprintf("%02x", b) }.reverse.join + end + + oids.reverse! + end end end From 4ff4fe394c5bf047cef155cf3cd263eeebd4faad Mon Sep 17 00:00:00 2001 From: Taylor Thurlow Date: Sun, 28 Jun 2020 12:19:44 -0700 Subject: [PATCH 6/7] Start implementing commit data chunk parsing --- src/thicket/commit_graph_file.cr | 59 ++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/thicket/commit_graph_file.cr b/src/thicket/commit_graph_file.cr index 6414955..a73a1f3 100644 --- a/src/thicket/commit_graph_file.cr +++ b/src/thicket/commit_graph_file.cr @@ -11,6 +11,7 @@ module Thicket getter oid_fanout : Array(UInt32) getter num_commits : UInt32 getter commit_oids : Array(String) + getter commit_data : Array(CommitData) def initialize(@file_path) file = File.new(@file_path, "rb") @@ -33,6 +34,9 @@ module Thicket puts "First commit: #{@commit_oids.first[0..6]}" puts " Last commit: #{@commit_oids.last[0..6]}" + @commit_data = parse_commit_data(file, contents, @num_commits) + pp @commit_data.first + file.close end @@ -141,5 +145,60 @@ module Thicket oids.reverse! end + + private def parse_commit_data( + file : File, + contents : Array({ signature: String, offset: UInt64 }), + num_commits : UInt32, + ) : Array(CommitData) + commit_data_index = contents.index { |c| c[:signature] == "CDAT" }.not_nil! + commit_data_offset = contents[commit_data_index][:offset] + + commit_data_length = if contents[commit_data_index + 1]? + contents[commit_data_index + 1][:offset] - commit_data_offset + else + # Exclude trailer hash if necessary + file.size - commit_hash_length - commit_data_offset + end + + single_commit_data_size = commit_hash_length + 16 + slice = Bytes.new(@num_commits * single_commit_data_size) + file.read_at(commit_data_offset.to_i32, commit_data_length.to_i32, &.read(slice)) + slice.reverse! + + puts "Found #{slice.size} bytes of commit data." + + Array.new(num_commits) do |i| + subslice = slice[i, single_commit_data_size] + + root_tree_oid = subslice[0, commit_hash_length].to_a + .map { |b| sprintf("%02x", b) } + .reverse + .join + + first_parent_slice = subslice[commit_hash_length, 4] + first_parent_value = first_parent_slice.to_unsafe.as(UInt32*).value + first_parent = first_parent_value == 0x7000000 ? nil : first_parent_value + + second_parent_slice = subslice[commit_hash_length, 4] + second_parent_value = second_parent_slice.to_unsafe.as(UInt32*).value + second_parent = second_parent_value == 0x7000000 ? nil : second_parent_value + + CommitData.new( + root_tree_oid, + first_parent, + second_parent, + ) + end + end + + struct CommitData + getter root_tree_oid : String + getter first_parent : UInt32 | Nil + getter second_parent : UInt32 | Nil + + def initialize(@root_tree_oid, @first_parent, @second_parent) + end + end end end From 9409004716ebc4144cfaf96a713f3e18786d6c73 Mon Sep 17 00:00:00 2001 From: Taylor Thurlow Date: Wed, 5 Aug 2020 13:12:55 -0700 Subject: [PATCH 7/7] temp commit --- src/thicket/commit_graph_file.cr | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/thicket/commit_graph_file.cr b/src/thicket/commit_graph_file.cr index a73a1f3..efd42f3 100644 --- a/src/thicket/commit_graph_file.cr +++ b/src/thicket/commit_graph_file.cr @@ -35,6 +35,7 @@ module Thicket puts " Last commit: #{@commit_oids.last[0..6]}" @commit_data = parse_commit_data(file, contents, @num_commits) + pp @commit_oids.first pp @commit_data.first file.close @@ -162,32 +163,46 @@ module Thicket end single_commit_data_size = commit_hash_length + 16 - slice = Bytes.new(@num_commits * single_commit_data_size) + slice = Bytes.new(commit_data_length) file.read_at(commit_data_offset.to_i32, commit_data_length.to_i32, &.read(slice)) slice.reverse! puts "Found #{slice.size} bytes of commit data." Array.new(num_commits) do |i| - subslice = slice[i, single_commit_data_size] + subslice = slice[i * single_commit_data_size, single_commit_data_size] root_tree_oid = subslice[0, commit_hash_length].to_a .map { |b| sprintf("%02x", b) } .reverse .join - first_parent_slice = subslice[commit_hash_length, 4] + first_parent_start = commit_hash_length + first_parent_slice = subslice[first_parent_start, 4] first_parent_value = first_parent_slice.to_unsafe.as(UInt32*).value first_parent = first_parent_value == 0x7000000 ? nil : first_parent_value - second_parent_slice = subslice[commit_hash_length, 4] + second_parent_start = first_parent_start + 4 + second_parent_slice = subslice[second_parent_start, 4] second_parent_value = second_parent_slice.to_unsafe.as(UInt32*).value second_parent = second_parent_value == 0x7000000 ? nil : second_parent_value + generation_number_start = second_parent_start + 4 + # the generation number is only the higher 30 bits, not all 32 + generation_number_slice = subslice[generation_number_start, 4] + generation_number = generation_number_slice.to_unsafe.as(UInt32*).value & 0xFFFFFFFC + + # the commit time starts including the last two bits of the generation + # number subslice, and all 4 subsequent bytes + commit_time_slice = subslice[generation_number_start, 8] + commit_time = commit_time_slice.to_unsafe.as(UInt64*).value & 0x00000003FFFFFFFF + CommitData.new( root_tree_oid, first_parent, second_parent, + generation_number, + commit_time, ) end end @@ -196,8 +211,10 @@ module Thicket getter root_tree_oid : String getter first_parent : UInt32 | Nil getter second_parent : UInt32 | Nil + getter generation_number : UInt32 + getter commit_time : UInt64 - def initialize(@root_tree_oid, @first_parent, @second_parent) + def initialize(@root_tree_oid, @first_parent, @second_parent, @generation_number, @commit_time) end end end