author test works and bibl #324
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: display name, trigger, and token permissions.
name: Process XML to JSON and HTML

# Runs on every push to the current data branch. The path filter is kept
# commented out, so any changed file on that branch triggers a run.
on:
  push:
    branches:
      - 'development_backup_cbss-bibls'  # current data branch
    # paths:
    #   - 'data/**'  # Trigger only on changes to data files

# id-token: write allows the job to request a GitHub OIDC token (used when
# assuming the AWS role); contents: read is sufficient for checking out code.
permissions:
  id-token: write
  contents: read
jobs:
  # Selects a sample of TEI XML records, converts each one to JSON with Saxon,
  # assembles an OpenSearch bulk (NDJSON) file, copies that file to S3, and
  # posts it to the OpenSearch _bulk endpoint.
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repositories
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout syriaca repository (code repo)
        uses: actions/checkout@v4
        with:
          repository: srophe/syriaca
          ref: staticSite
          path: syriaca  # Check it out into a subfolder

      # Step 2: Install Java and Saxon for XSLT
      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'

      - name: Download Saxon from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar

      # Step 3: Assume the AWS role via OIDC (relies on `id-token: write`)
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data

      # Step 4: Find updated XML files
      # NOTE: `::set-output` is deprecated; write to "$GITHUB_OUTPUT" instead
      # if this step is ever re-enabled.
      # - name: Identify updated XML files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '.xml')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "::set-output name=updated_files::$UPDATED_FILES"
      #   id: files

      # Step 4: Identify 25 XML files for testing
      # - name: Identify first 25 XML files
      #   run: |
      #     find ./data/persons/tei -name '*.xml' | head -n 25 > xml_files.txt
      #     echo "Processing 25 XML files:"
      #     cat xml_files.txt

      # Builds xml_files.txt: up to 25 files per enabled category plus a
      # hand-picked list of records. A file may appear twice; it then gets the
      # same bulk _id twice, which is harmless for indexing.
      - name: Specify specific XML files and 25 per category
        run: |
          # find ./data/persons/tei -name '*.xml' | head -n 25 > xml_files.txt
          # find ./data/places/tei -name '*.xml' | head -n 25 >> xml_files.txt
          find ./data/works/tei -name '*.xml' | head -n 25 > xml_files.txt
          find ./data/bibl/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # find ./data/subjects/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # echo "./data/persons/tei/25.xml" >> xml_files.txt
          # echo "./data/persons/tei/110.xml" >> xml_files.txt
          # echo "./data/persons/tei/106.xml" >> xml_files.txt
          # echo "./data/persons/tei/109.xml" >> xml_files.txt
          # echo "./data/persons/tei/101.xml" >> xml_files.txt
          # echo "./data/persons/tei/100.xml" >> xml_files.txt
          # echo "./data/persons/tei/102.xml" >> xml_files.txt
          # echo "./data/persons/tei/1021.xml" >> xml_files.txt
          # echo "./data/persons/tei/320.xml" >> xml_files.txt
          # echo "./data/persons/tei/67.xml" >> xml_files.txt
          # echo "./data/persons/tei/544.xml" >> xml_files.txt
          # echo "./data/persons/tei/732.xml" >> xml_files.txt
          # echo "./data/places/tei/10.xml" >> xml_files.txt
          # echo "./data/places/tei/78.xml" >> xml_files.txt
          # echo "./data/places/tei/1507.xml" >> xml_files.txt
          # echo "./data/places/tei/1486.xml" >> xml_files.txt
          # echo "./data/places/tei/104.xml" >> xml_files.txt
          # echo "./data/places/tei/602.xml" >> xml_files.txt
          echo "./data/works/tei/315.xml" >> xml_files.txt
          echo "./data/works/tei/9501.xml" >> xml_files.txt
          echo "./data/works/tei/nhsl/tei/9723.xml" >> xml_files.txt
          echo "./data/works/tei/nhsl/tei/9724.xml" >> xml_files.txt
          echo "./data/works/tei/10510.xml" >> xml_files.txt
          echo "./data/works/tei/nhsl/tei/10511.xml" >> xml_files.txt
          echo "Processing specified XML files:"
          cat xml_files.txt

      # Step 5: Run XSLT Transformations and Merge into Single JSON
      # Each record contributes two lines to bulk_data.json: an index action
      # line and the single-line JSON document produced by the stylesheet.
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          # Without pipefail the exit status of `java ... | tr` is always tr's,
          # so a failed transformation would go unnoticed.
          set -o pipefail
          : > bulk_data.json  # Create the bulk JSON file (emptied if it already exists)
          # Commented out code in this section for possible optimization of code
          # mkdir -p data-html
          # echo "Created HTML directory"
          indexed=0
          skipped=0
          while IFS= read -r file; do
            [ -n "$file" ] || continue
            echo "Processing $file"
            # Extract the document type from the file path (first match only)
            if [[ "$file" =~ (work|subject|person|place|bibl) ]]; then
              type="${BASH_REMATCH[1]}"
            else
              echo "::warning::Skipping $file: no document type found in its path"
              skipped=$((skipped + 1))
              continue
            fi
            if [ "$type" = "bibl" ]; then
              type="cbss"
            fi
            echo "Json type $type"
            # Extract the filename used in the OpenSearch document id
            filename=$(basename "${file%.xml}")
            echo "Processing $filename for JSON"
            # Transform first and write the action/document pair only when Saxon
            # succeeded, so the bulk file never holds an index header that has
            # no document after it. stdin is detached so java cannot consume
            # the file list the loop is reading.
            if doc=$(java -jar saxon.jar "-s:$file" -xsl:json-stylesheet.xsl docType="$type" < /dev/null | tr -d '\n') && [ -n "$doc" ]; then
              printf '{"index":{"_index":"syriaca-index-5","_id":"%s-%s"}}\n' "$type" "$filename" >> bulk_data.json
              printf '%s\n' "$doc" >> bulk_data.json
              indexed=$((indexed + 1))
            else
              echo "::warning::Skipping $file: XSLT transformation failed or produced no output"
              skipped=$((skipped + 1))
            fi
            # Apply XSLT for HTML conversion and capture any error
            # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:${filename}.html 2>&1 | tee saxon_error.log
            # # Upload the HTML file to S3
            # aws s3 cp $(basename ${file%.xml}.html) s3://srophe-syriaca-front-end/${type}/${filename}.html
          done < xml_files.txt
          echo "Documents written: $indexed, skipped: $skipped"
          if [ "$indexed" -eq 0 ]; then
            echo "::error::No documents were produced; nothing to index"
            exit 1
          fi
        # NOTE(review): nothing in the script above reads these two variables;
        # they look like leftovers from the commented-out S3 upload - confirm
        # before removing.
        env:
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}

      # Step 6: Convert HTML files
      # - name: Create static HTML directory
      #   run: |
      #     mkdir -p data-html
      #     echo "Created HTML directory"
      # - name: Run XSLT Transformations for HTML
      #   run: |
      #     while IFS= read -r file; do
      #       echo "Processing $file for HTML"
      #       # Extract the document type from the file path
      #       type=$(echo "$file" | grep -o -E 'work|subject|person|place|bibl' | tail -n 1)
      #       # Extract the filename and create the index header for OpenSearch bulk format
      #       filename=$(basename ${file%.xml})
      #       echo "html filename: $filename"
      #       # if [ "$type" == "bibl" ]; then
      #       #   type="cbss"
      #       # fi
      #       echo "HTML type $type"
      #       # Run the XSLT transformation located in the root of syriaca-data repository
      #       java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:data-html/${type}/${filename}.html
      #       # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:data-html/${filename}.html
      #     done < xml_files.txt

      # Step 7: Upload files to S3
      - name: Upload JSON file to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/advancedsearchfields/bulk_data_author_index_5.json
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}

      # - name: Upload HTML files to S3
      #   run: |
      #     for html_file in $(find ./data-html -name "*.html"); do
      #       type=$(echo "$html_file" | grep -o -E 'work|subject|person|place|bibl' | tail -n 1)
      #       echo "html_file $html_file"
      #       if [ "$type" == "subject" ]; then
      #         type="taxonomy"
      #       fi
      #       if [ "$type" == "bibl" ]; then
      #         type="cbss"
      #       fi
      #       # Copy html file to S3 with the idno path
      #       aws s3 cp $html_file s3://srophe-syriaca-front-end/${type}/$(basename ${html_file%.html})
      #     done
      #   env:
      #     AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      #     AWS_REGION: ${{ secrets.AWS_REGION }}

      # Step 8: Upload JSON data to OpenSearch
      - name: JSON file to OpenSearch
        env:
          OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
          OPENSEARCH_USER: ${{ secrets.OPENSEARCH_USER }}
          OPENSEARCH_PASSWORD: ${{ secrets.OPENSEARCH_PASSWORD }}
        run: |
          # -sS keeps curl quiet but still prints transport errors.
          RESPONSE=$(curl -sS -o response.json -w "%{http_code}" -XPOST "$OPENSEARCH_URL/_bulk" \
            -H "Content-Type: application/json" \
            -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
            --data-binary "@bulk_data.json")
          echo "HTTP response code: $RESPONSE"
          cat response.json
          # A request rejected as a whole (bad credentials, oversized payload,
          # server error) has no "errors" field in its body, so the status code
          # has to be checked before looking for per-document errors.
          case "$RESPONSE" in
            2??) ;;
            *)
              echo "::error::Bulk request failed with HTTP status $RESPONSE"
              exit 1
              ;;
          esac
          # Check for errors in the response
          if grep -Eq '"errors"[[:space:]]*:[[:space:]]*true' response.json; then
            echo "Errors occurred during bulk upload"
            exit 1
          fi