# Workflow file for GitHub Actions run "bibl cbs subject" (#318).
name: Process XML to JSON and HTML

# Trigger: any push to the branch that currently holds the data.
on:
  push:
    branches:
      - 'development_backup_cbss-bibls'  # current data branch
    # Uncomment to run only when data files change:
    # paths:
    #   - 'data/**'

# Token scopes for the run: read access to repository contents, plus
# permission to request an OIDC id-token (the job assumes an AWS role).
permissions:
  contents: read
  id-token: write
jobs:
  # Single job: transforms TEI XML records into an OpenSearch bulk file with
  # Saxon, stores a copy of that file in S3 and posts it to the OpenSearch
  # _bulk endpoint. The HTML conversion/upload steps are kept below as
  # commented-out alternatives.
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repositories
      # (v4 of the actions: the v3 releases target the retired Node 16 runtime)
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout syriaca repository (code repo)
        uses: actions/checkout@v4
        with:
          repository: srophe/syriaca
          ref: staticSite
          path: syriaca  # Check it out into a subfolder

      # Step 2: Install Java and Saxon for XSLT
      - name: Set up JDK 11
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'

      # The jar comes from Maven Central (repo1.maven.org), not from GitHub.
      - name: Download Saxon-HE from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar

      # Step 3: Assume the AWS role through GitHub OIDC
      # (requires the workflow-level `id-token: write` permission)
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data

      # Step 4 (disabled alternative): find updated XML files
      # NOTE: `::set-output` is deprecated; write to "$GITHUB_OUTPUT" instead
      # if this step is ever re-enabled.
      # - name: Identify updated XML files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '.xml')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "::set-output name=updated_files::$UPDATED_FILES"
      #   id: files

      # Step 4 (disabled alternative): identify 25 XML files for testing
      # - name: Identify first 25 XML files
      #   run: |
      #     find ./data/persons/tei -name '*.xml' | head -n 25 > xml_files.txt
      #     echo "Processing 25 XML files:"
      #     cat xml_files.txt

      # Step 4: Build the list of XML files to process (xml_files.txt)
      - name: Specify specific XML files and 25 per category
        run: |
          # Start from an empty list so a pre-existing xml_files.txt cannot
          # leak into this run. Every line below appends, so any combination
          # of them can be uncommented.
          : > xml_files.txt
          # find ./data/persons/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # find ./data/places/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # find ./data/works/tei -name '*.xml' | head -n 25 >> xml_files.txt
          find ./data/bibl/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # find ./data/subjects/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # echo "./data/persons/tei/25.xml" >> xml_files.txt
          # echo "./data/persons/tei/110.xml" >> xml_files.txt
          # echo "./data/persons/tei/106.xml" >> xml_files.txt
          # echo "./data/persons/tei/109.xml" >> xml_files.txt
          # echo "./data/persons/tei/101.xml" >> xml_files.txt
          # echo "./data/persons/tei/100.xml" >> xml_files.txt
          # echo "./data/persons/tei/102.xml" >> xml_files.txt
          # echo "./data/persons/tei/1021.xml" >> xml_files.txt
          # echo "./data/persons/tei/320.xml" >> xml_files.txt
          # echo "./data/persons/tei/67.xml" >> xml_files.txt
          # echo "./data/persons/tei/544.xml" >> xml_files.txt
          # echo "./data/persons/tei/732.xml" >> xml_files.txt
          # echo "./data/places/tei/10.xml" >> xml_files.txt
          # echo "./data/places/tei/78.xml" >> xml_files.txt
          # echo "./data/places/tei/1507.xml" >> xml_files.txt
          # echo "./data/places/tei/1486.xml" >> xml_files.txt
          # echo "./data/places/tei/104.xml" >> xml_files.txt
          # echo "./data/places/tei/602.xml" >> xml_files.txt
          # echo "./data/works/tei/315.xml" >> xml_files.txt
          # echo "./data/works/tei/9501.xml" >> xml_files.txt
          # echo "./data/works/tei/nhsl/tei/9723.xml" >> xml_files.txt
          # echo "./data/works/tei/nhsl/tei/9724.xml" >> xml_files.txt
          # echo "./data/works/tei/10510.xml" >> xml_files.txt
          # echo "./data/works/tei/nhsl/tei/10511.xml" >> xml_files.txt
          # Fail early: an empty list would produce an empty bulk request.
          if [ ! -s xml_files.txt ]; then
            echo "No XML files selected for processing" >&2
            exit 1
          fi
          # echo "Processing specified XML files:"
          cat xml_files.txt

      # Step 5: Run XSLT Transformations and Merge into Single JSON
      # Output format: for each record, one action line followed by one
      # single-line JSON document (OpenSearch bulk format).
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          # The default run shell has no pipefail; without it a Saxon failure
          # would be hidden behind the exit status of `tr`.
          set -o pipefail
          : > bulk_data.json  # Create (or reset) the bulk JSON file
          # Commented out code in this section for possible optimization of code
          # mkdir -p data-html
          # echo "Created HTML directory"
          while IFS= read -r file; do
            echo "Processing $file"
            # Extract the document type from the file path. Only the first
            # (leftmost) keyword is used, so the value is always one word.
            if [[ "$file" =~ (work|subject|person|place|bibl) ]]; then
              type="${BASH_REMATCH[1]}"
            else
              echo "Cannot determine document type for $file" >&2
              exit 1
            fi
            if [ "$type" == "bibl" ]; then
              type="cbss"
            fi
            echo "Json type $type"
            # Extract the filename (without .xml) used in the document id
            filename=$(basename "$file" .xml)
            echo "Processing $filename for JSON"
            # Apply XSLT for JSON conversion first and flatten it to one line;
            # nothing is written to the bulk file unless the transform
            # succeeded and produced output. stdin is redirected so java can
            # never consume the file list the loop is reading.
            if ! document=$(java -jar saxon.jar "-s:$file" -xsl:json-stylesheet.xsl docType="$type" < /dev/null | tr -d '\n') || [ -z "$document" ]; then
              echo "Saxon JSON transformation failed for $file" >&2
              exit 1
            fi
            # Index header for OpenSearch bulk format, then the document line.
            # Values are passed as printf arguments, not as part of the format.
            printf '{"index":{"_index":"syriaca-index-4","_id":"%s-%s"}}\n%s\n' "$type" "$filename" "$document" >> bulk_data.json
            # Apply XSLT for HTML conversion and capture any error
            # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:${filename}.html 2>&1 | tee saxon_error.log
            # # Upload the HTML file to S3
            # aws s3 cp $(basename ${file%.xml}.html) s3://srophe-syriaca-front-end/${type}/${filename}.html
          done < xml_files.txt
        env:
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}

      # Step 6 (disabled): Convert HTML files
      # - name: Create static HTML directory
      #   run: |
      #     mkdir -p data-html
      #     echo "Created HTML directory"
      # - name: Run XSLT Transformations for HTML
      #   run: |
      #     while IFS= read -r file; do
      #       echo "Processing $file for HTML"
      #       # Extract the document type from the file path
      #       type=$(echo "$file" | grep -o -E 'work|subject|person|place|bibl' | tail -n 1)
      #       # Extract the filename and create the index header for OpenSearch bulk format
      #       filename=$(basename ${file%.xml})
      #       echo "html filename: $filename"
      #       # if [ "$type" == "bibl" ]; then
      #       #   type="cbss"
      #       # fi
      #       echo "HTML type $type"
      #       # Run the XSLT transformation located in the root of syriaca-data repository
      #       java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:data-html/${type}/${filename}.html
      #       # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:data-html/${filename}.html
      #     done < xml_files.txt

      # Step 7: Upload files to S3
      - name: Upload JSON file to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/advancedsearchfields/bulk_data_bibl_index_4.json
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}

      # (disabled) Upload the generated HTML files to S3
      # - name: Upload HTML files to S3
      #   run: |
      #     for html_file in $(find ./data-html -name "*.html"); do
      #       type=$(echo "$html_file" | grep -o -E 'work|subject|person|place|bibl' | tail -n 1)
      #       echo "html_file $html_file"
      #       if [ "$type" == "subject" ]; then
      #         type="taxonomy"
      #       fi
      #       if [ "$type" == "bibl" ]; then
      #         type="cbss"
      #       fi
      #       # Copy html file to S3 with the idno path
      #       aws s3 cp $html_file s3://srophe-syriaca-front-end/${type}/$(basename ${html_file%.html})
      #     done
      #   env:
      #     AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      #     AWS_REGION: ${{ secrets.AWS_REGION }}

      # Step 8: Upload JSON data to OpenSearch
      - name: JSON file to OpenSearch
        env:
          OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
          OPENSEARCH_USER: ${{ secrets.OPENSEARCH_USER }}
          OPENSEARCH_PASSWORD: ${{ secrets.OPENSEARCH_PASSWORD }}
        run: |
          # -S keeps curl's own error message visible despite -s.
          RESPONSE=$(curl -sS -o response.json -w "%{http_code}" -XPOST "$OPENSEARCH_URL/_bulk" \
            -H "Content-Type: application/json" \
            -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
            --data-binary "@bulk_data.json")
          echo "HTTP response code: $RESPONSE"
          cat response.json
          # A rejected request (auth failure, payload too large, ...) does not
          # contain "errors":true, so the HTTP status has to be checked too.
          if [[ "$RESPONSE" != 2* ]]; then
            echo "Bulk request failed with HTTP status $RESPONSE"
            exit 1
          fi
          # Check for per-document errors in the response
          if grep -Eq '"errors"[[:space:]]*:[[:space:]]*true' response.json; then
            echo "Errors occurred during bulk upload"
            exit 1
          fi