# main.yml
name: Process XML to JSON and HTML
on:
  push:
    branches:
      - 'development_backup_cbss-bibls' # current data branch
    # paths:
    #   - 'data/**' # Trigger only on changes to data files
permissions:
  id-token: write
  contents: read
jobs:
  process_and_transform:
    runs-on: ubuntu-latest
    steps:
      # Step 1: Check out the repositories
      - name: Checkout repository
        uses: actions/checkout@v3
      - name: Checkout syriaca repository (code repo)
        uses: actions/checkout@v3
        with:
          repository: srophe/syriaca
          ref: staticSite
          path: syriaca # Check it out into a subfolder
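      # The data repo sits at the workspace root; the code repo lands in ./syriaca.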
      # Step 2: Install Java and Saxon for XSLT
      - name: Set up JDK 11
        uses: actions/setup-java@v3
        with:
          java-version: '11'
          distribution: 'temurin'
      - name: Download Saxon from Maven Central
        run: |
          wget https://repo1.maven.org/maven2/net/sf/saxon/Saxon-HE/10.6/Saxon-HE-10.6.jar -O saxon.jar
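          # Optional integrity check (sketch): pin the jar's digest and verify it here.
          # The value below is a placeholder, not Saxon-HE 10.6's real SHA-256.
          # echo "<expected-sha256>  saxon.jar" | sha256sum -c -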
      # Step 3: Configure AWS credentials via OIDC
      - name: Configure AWS credentials from AWS account
        uses: aws-actions/configure-aws-credentials@v2
        with:
          role-to-assume: ${{ secrets.AWS_SROPHE_ROLE }}
          aws-region: us-east-1
          role-session-name: GitHub-OIDC-data
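      # Note: this step exchanges the job's OIDC token (enabled by the id-token: write
      # permission above) for temporary AWS credentials; the IAM role's trust policy
      # must allow GitHub's OIDC provider, token.actions.githubusercontent.com.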
      # Step 4: Find updated XML files
      # - name: Identify updated XML files
      #   id: files
      #   run: |
      #     UPDATED_FILES=$(git diff --name-only HEAD~1 HEAD | grep '\.xml$')
      #     echo "Updated XML files: $UPDATED_FILES"
      #     echo "updated_files=$UPDATED_FILES" >> "$GITHUB_OUTPUT"
      # Step 4 (alternative): Identify 25 XML files for testing
      # - name: Identify first 25 XML files
      #   run: |
      #     find ./data/persons/tei -name '*.xml' | head -n 25 > xml_files.txt
      #     echo "Processing 25 XML files:"
      #     cat xml_files.txt
      - name: Specify XML files to process (specific files or first 25 per category)
        run: |
          # find ./data/persons/tei -name '*.xml' | head -n 25 > xml_files.txt
          # find ./data/places/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # find ./data/works/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # find ./data/bibl/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # find ./data/subjects/tei -name '*.xml' | head -n 25 >> xml_files.txt
          # echo "./data/persons/tei/25.xml" > xml_files.txt
          # echo "./data/persons/tei/110.xml" >> xml_files.txt
          # echo "./data/persons/tei/106.xml" >> xml_files.txt
          # echo "./data/persons/tei/109.xml" >> xml_files.txt
          # echo "./data/persons/tei/101.xml" >> xml_files.txt
          # echo "./data/persons/tei/100.xml" >> xml_files.txt
          # echo "./data/persons/tei/102.xml" >> xml_files.txt
          # echo "./data/persons/tei/1021.xml" >> xml_files.txt
          # echo "./data/persons/tei/320.xml" >> xml_files.txt
          # echo "./data/persons/tei/67.xml" >> xml_files.txt
          # echo "./data/persons/tei/544.xml" >> xml_files.txt
          # echo "./data/persons/tei/732.xml" >> xml_files.txt
          # echo "./data/places/tei/10.xml" >> xml_files.txt
          echo "./data/places/tei/78.xml" >> xml_files.txt
          # echo "./data/places/tei/1507.xml" >> xml_files.txt
          echo "./data/places/tei/1486.xml" >> xml_files.txt
          # echo "./data/places/tei/104.xml" >> xml_files.txt
          echo "./data/places/tei/602.xml" >> xml_files.txt
          # echo "./data/works/tei/315.xml" >> xml_files.txt
          # echo "./data/works/tei/9501.xml" >> xml_files.txt
          # echo "./data/works/tei/nhsl/tei/9723.xml" >> xml_files.txt
          # echo "./data/works/tei/nhsl/tei/9724.xml" >> xml_files.txt
          # echo "./data/works/tei/10510.xml" >> xml_files.txt
          # echo "./data/works/tei/nhsl/tei/10511.xml" >> xml_files.txt
          echo "Processing specified XML files:"
          cat xml_files.txt
      # Step 5: Run XSLT Transformations and Merge into Single JSON
      - name: Run XSLT Transformations and Create Bulk JSON
        run: |
          touch bulk_data.json # Create the bulk JSON file
          # Commented-out code in this section is kept for possible future optimization
          # mkdir -p data-html
          # echo "Created HTML directory"
          while IFS= read -r file; do
            echo "Processing $file"
            # Extract the document type from the file path
            type=$(echo "$file" | grep -o -E 'work|subject|person|place|bibl' | head -n 1)
            if [ "$type" == "bibl" ]; then
              type="cbss"
            fi
            echo "Json type $type"
            # Extract the filename and create the index header for OpenSearch bulk format
            filename=$(basename "${file%.xml}")
            echo "Processing $filename for JSON"
            printf '{"index":{"_index":"syriaca-index-4","_id":"%s-%s"}}\n' "$type" "$filename" >> bulk_data.json
            # Apply XSLT for JSON conversion and append it to bulk_data.json directly
            java -jar saxon.jar -s:"$file" -xsl:json-stylesheet.xsl docType="$type" | tr -d '\n' >> bulk_data.json
            echo "" >> bulk_data.json # Add a newline after the document entry
            # Apply XSLT for HTML conversion and capture any error
            # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:${filename}.html 2>&1 | tee saxon_error.log
            # # Upload the HTML file to S3
            # aws s3 cp $(basename ${file%.xml}.html) s3://srophe-syriaca-front-end/${type}/${filename}.html
          done < xml_files.txt
        env:
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
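      # Optional sanity check (sketch, not part of the original pipeline): confirm the
      # bulk file has two lines per document before shipping it anywhere.
      # - name: Verify bulk file
      #   run: |
      #     echo "bulk_data.json has $(wc -l < bulk_data.json) lines (expect 2 per document)"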
      # Step 6: Convert HTML files
      # - name: Create static HTML directory
      #   run: |
      #     mkdir -p data-html
      #     echo "Created HTML directory"
      # - name: Run XSLT Transformations for HTML
      #   run: |
      #     while IFS= read -r file; do
      #       echo "Processing $file for HTML"
      #       # Extract the document type from the file path
      #       type=$(echo "$file" | grep -o -E 'work|subject|person|place|bibl' | tail -n 1)
      #       # Extract the filename and create the index header for OpenSearch bulk format
      #       filename=$(basename "${file%.xml}")
      #       echo "html filename: $filename"
      #       # if [ "$type" == "bibl" ]; then
      #       #   type="cbss"
      #       # fi
      #       echo "HTML type $type"
      #       # Run the XSLT transformation located in the root of syriaca-data repository
      #       java -jar saxon.jar -s:"$file" -xsl:html-stylesheet.xsl -o:data-html/${type}/${filename}.html
      #       # java -jar saxon.jar -s:$file -xsl:html-stylesheet.xsl -o:data-html/${filename}.html
      #     done < xml_files.txt
      # Step 7: Upload files to S3
      - name: Upload JSON file to S3
        run: |
          aws s3 cp bulk_data.json s3://srophe-syriaca-front-end/json-data/advancedsearchfields/bulk_data_places_selected_index_4.json
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
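      # Note: the AWS CLI reads AWS_REGION from the environment, so the secret set here
      # overrides the us-east-1 region configured in Step 3 and should agree with it;
      # AWS_ACCOUNT_ID is not consumed by the s3 cp call itself.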
      # - name: Upload HTML files to S3
      #   run: |
      #     for html_file in $(find ./data-html -name "*.html"); do
      #       type=$(echo "$html_file" | grep -o -E 'work|subject|person|place|bibl' | tail -n 1)
      #       echo "html_file $html_file"
      #       if [ "$type" == "subject" ]; then
      #         type="taxonomy"
      #       fi
      #       if [ "$type" == "bibl" ]; then
      #         type="cbss"
      #       fi
      #       # Copy html file to S3 with the idno path
      #       aws s3 cp $html_file s3://srophe-syriaca-front-end/${type}/$(basename ${html_file%.html})
      #     done
      #   env:
      #     AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
      #     AWS_REGION: ${{ secrets.AWS_REGION }}
      # Step 8: Upload JSON data to OpenSearch
      - name: Upload JSON file to OpenSearch
        env:
          OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
          OPENSEARCH_USER: ${{ secrets.OPENSEARCH_USER }}
          OPENSEARCH_PASSWORD: ${{ secrets.OPENSEARCH_PASSWORD }}
        run: |
          RESPONSE=$(curl -s -o response.json -w "%{http_code}" -XPOST "$OPENSEARCH_URL/_bulk" \
            -H "Content-Type: application/json" \
            -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" \
            --data-binary "@bulk_data.json")
          echo "HTTP response code: $RESPONSE"
          cat response.json
          # Check for errors in the response
          if grep -q '"errors":true' response.json; then
            echo "Errors occurred during bulk upload"
            exit 1
          fi
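      # Optional follow-up (sketch, assumes the same index name and secrets as above):
      # - name: Verify document count
      #   env:
      #     OPENSEARCH_URL: ${{ secrets.OPENSEARCH_URL }}
      #     OPENSEARCH_USER: ${{ secrets.OPENSEARCH_USER }}
      #     OPENSEARCH_PASSWORD: ${{ secrets.OPENSEARCH_PASSWORD }}
      #   run: |
      #     curl -s -u "$OPENSEARCH_USER:$OPENSEARCH_PASSWORD" "$OPENSEARCH_URL/syriaca-index-4/_count?pretty"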