Skip to content

Commit

Permalink
Merge pull request #401 from Klimatbyran/staging
Browse files: browse the repository at this point in the history
Stop early if we don't get any content from the parsed PDF
  • Loading branch information
hugo-nl authored Dec 6, 2024
2 parents c7bbfd5 + 6402f29 commit bf2fa80
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion src/workers/nlmParsePDF.ts
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,6 +5,7 @@ import redis from '../config/redis'
import precheck from './precheck'
import { jsonToMarkdown } from '../lib/jsonExtraction'
import { vectorDB } from '../lib/vectordb'
import { ParsedDocument } from '../lib/nlm-ingestor-schema'

const headers = {
'User-Agent':
Expand Down Expand Up @@ -41,7 +42,7 @@ const nlmParsePDF = new DiscordWorker(
await job.editMessage('Nu borde det vara klart... 🤔')
}
}, 10000)
- let json
+ let json: ParsedDocument
try {
json = await extractJsonFromPdf(pdf)
} catch (err) {
Expand All @@ -59,6 +60,12 @@ const nlmParsePDF = new DiscordWorker(
clearInterval(interval)
}
const markdown = jsonToMarkdown(json)

if (!json.return_dict.result.blocks.length || !markdown.trim()) {
await job.editMessage('❌ Fel vid tolkning av PDF: Inget innehåll')
throw new Error('No content in parsed PDF: ' + JSON.stringify(json))
}

job.log('text found:\n' + markdown)
job.updateData({
...job.data,
Expand Down

0 comments on commit bf2fa80

Please sign in to comment.