Skip to content

Commit

Permalink
collecting all the swhid dirs
Browse files Browse the repository at this point in the history
  • Loading branch information
Mazztok45 committed Oct 23, 2024
1 parent ab6c036 commit 1b40834
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 0 deletions.
49 changes: 49 additions & 0 deletions src/zbmath_rest2oai/collection_swhid_dir_ids.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import requests
import re
import pandas as pd

import os
import pandas as pd

# Directory containing this script; os.path.split(...)[0] is the directory
# component of the script's absolute path.
current_dir = os.path.split(os.path.abspath(__file__))[0]

# Project root: the script lives in 'src/zbmath_rest2oai', so go up two levels.
_root_candidate = os.path.join(current_dir, "../../")
root_dir = os.path.abspath(_root_candidate)

# Input table; the processing below reads its 'swhid' column.
file_path = os.path.join(root_dir, "test/data/software/swh_swmath.csv")

# Loaded at module level because the rest of the script works on `df` directly.
df = pd.read_csv(file_path)

def get_swhid_dir(snaphot_url, timeout=30):
    """Return the directory SWHID found on a Software Heritage browse page.

    Fetches ``snaphot_url`` and returns the first ``swh:1:dir:<hex>``
    identifier that appears in the response body.

    Args:
        snaphot_url: URL of the page to download. The parameter name
            (including its spelling) is kept so keyword callers keep working.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error instead of blocking forever.

    Returns:
        The string ``"swh:1:dir:<hex>"`` when an identifier is found,
        otherwise the integer ``0`` (sentinel kept for compatibility with
        existing callers that store the value as-is).

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails or
            times out.
    """
    # requests applies no timeout by default, so a stalled connection would
    # hang the whole batch run; bound the wait explicitly.
    req = requests.get(snaphot_url, timeout=timeout)

    # The identifier is pure ASCII, so a stray non-UTF-8 byte elsewhere in the
    # page must not abort the lookup: replace undecodable bytes.
    html_content = req.content.decode('utf-8', errors='replace')

    # Use regex to find the object_id (in this case, directory identifier)
    match = re.search(r'swh:1:dir:([a-f0-9]+)', html_content)
    if not match:
        print("Object ID not found")
        return 0

    object_id = match.group(1)
    print(f"Object ID: {object_id}")
    return "swh:1:dir:" + object_id


# Resolve every snapshot SWHID in the input table to a directory SWHID by
# querying the Software Heritage browse page of that snapshot.
_SNAPSHOT_PREFIX = "swh:1:snp:"

list_swhid_dir = []
for snapshot in df.swhid:
    # Empty CSV cells are read by pandas as NaN (a float), and a value without
    # the snapshot prefix has no hash to extract; record the same 0 sentinel
    # get_swhid_dir uses for "not found" instead of crashing mid-run.
    if not isinstance(snapshot, str) or _SNAPSHOT_PREFIX not in snapshot:
        print(f"Skipping invalid snapshot SWHID: {snapshot!r}")
        list_swhid_dir.append(0)
        continue

    url = "https://archive.softwareheritage.org/browse/snapshot/{}/directory/".format(
        snapshot.split(_SNAPSHOT_PREFIX)[1]
    )
    print(url)
    swhid_dir = get_swhid_dir(url)
    list_swhid_dir.append(swhid_dir)


df["swhid_dir"] = list_swhid_dir
# DataFrame has no `write` method; `to_csv` is the CSV writer. index=False
# keeps the output columns aligned with the input file (no extra index column).
df.to_csv(
    os.path.join(root_dir, "test/data/software/swh_swmath_swhid_dir.csv"),
    index=False,
)

9 changes: 9 additions & 0 deletions src/zbmath_rest2oai/state.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"software": 47831,
"document": 0,
"software_perf": {
"records": 45500,
"time_rest": 626.5267510000003,
"time_oai": 116.78700248885434
}
}

0 comments on commit 1b40834

Please sign in to comment.