From 6402f2996b2b144209d280fbe2d0ec7f06309c21 Mon Sep 17 00:00:00 2001 From: Samuel Plumppu <6125097+Greenheart@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:55:36 +0100 Subject: [PATCH] Stop early if we don't get any content from the parsed PDF --- src/workers/nlmParsePDF.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/workers/nlmParsePDF.ts b/src/workers/nlmParsePDF.ts index a174d3ab..f06e9fe9 100644 --- a/src/workers/nlmParsePDF.ts +++ b/src/workers/nlmParsePDF.ts @@ -5,6 +5,7 @@ import redis from '../config/redis' import precheck from './precheck' import { jsonToMarkdown } from '../lib/jsonExtraction' import { vectorDB } from '../lib/vectordb' +import { ParsedDocument } from '../lib/nlm-ingestor-schema' const headers = { 'User-Agent': @@ -41,7 +42,7 @@ const nlmParsePDF = new DiscordWorker( await job.editMessage('Nu borde det vara klart... 🤔') } }, 10000) - let json + let json: ParsedDocument try { json = await extractJsonFromPdf(pdf) } catch (err) { @@ -59,6 +60,12 @@ const nlmParsePDF = new DiscordWorker( clearInterval(interval) } const markdown = jsonToMarkdown(json) + + if (!json.return_dict.result.blocks.length || !markdown.trim()) { + await job.editMessage('❌ Fel vid tolkning av PDF: Inget innehåll') + throw new Error('No content in parsed PDF: ' + JSON.stringify(json)) + } + job.log('text found:\n' + markdown) job.updateData({ ...job.data,