Skip to content

Commit

Permalink
Fix content not correctly parsed at pdf content importer (#12338)
Browse files Browse the repository at this point in the history
* fix the isFarAway issue

1. isFarAway should compare with the last text position of the same font size, not the immediately preceding text
2. fix the abs value issue.
3. fix the Ygap calculation issue.

* add changelog entry

---------

Co-authored-by: Siedlerchr <[email protected]>
  • Loading branch information
leaf-soba and Siedlerchr authored Jan 4, 2025
1 parent 654622a commit 1e837cd
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 13 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv

### Changed

- We improved the offline parsing of BibTeX data from PDF-documents. [#12278](https://github.com/JabRef/jabref/issues/12278)

### Fixed

- We fixed an issue where a bib file with UTF-8 charset was wrongly loaded with a different charset [forum#5369](https://discourse.jabref.org/t/jabref-5-15-opens-bib-files-with-shift-jis-encoding-instead-of-utf-8/5369/)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,11 +243,13 @@ protected void writeString(String text, List<TextPosition> textPositions) {
}

private boolean isFarAway(TextPosition previous, TextPosition current) {
float XspaceThreshold = 3.0F;
float YspaceThreshold = previous.getFontSizeInPt() * 1.5F;
float XspaceThreshold = previous.getFontSizeInPt() * 3.0F;
float YspaceThreshold = previous.getFontSizeInPt() * 3.0F;
float Xgap = current.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj());
float Ygap = current.getYDirAdj() - (previous.getYDirAdj() - previous.getHeightDir());
return Xgap > XspaceThreshold && Ygap > YspaceThreshold;
float Ygap = current.getYDirAdj() - previous.getYDirAdj();
// For cases like paper titles spanning two or more lines, both X and Y gaps must exceed thresholds,
// so "&&" is used instead of "||".
return Math.abs(Xgap) > XspaceThreshold && Math.abs(Ygap) > YspaceThreshold;
}

private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition textPosition) {
Expand All @@ -258,28 +260,27 @@ private boolean isUnwantedText(TextPosition previousTextPosition, TextPosition t
return true;
}
// The title is usually not in the bottom 10% of a page.
if ((textPosition.getPageHeight() - textPosition.getYDirAdj())
< (textPosition.getPageHeight() * 0.1)) {
return true;
}
// The title character usually stay together.
return isFarAway(previousTextPosition, textPosition);
return (textPosition.getPageHeight() - textPosition.getYDirAdj())
< (textPosition.getPageHeight() * 0.1);
}

private Optional<String> findLargestFontText(List<TextPosition> textPositions) {
Map<Float, StringBuilder> fontSizeTextMap = new TreeMap<>(Collections.reverseOrder());
Map<Float, TextPosition> lastPositionMap = new TreeMap<>(Collections.reverseOrder());
TextPosition previousTextPosition = null;
for (TextPosition textPosition : textPositions) {
float fontSize = textPosition.getFontSizeInPt();
// Exclude unwanted text based on heuristics
if (isUnwantedText(previousTextPosition, textPosition)) {
if (isUnwantedText(previousTextPosition, textPosition) ||
(lastPositionMap.containsKey(fontSize) && isFarAway(lastPositionMap.get(fontSize), textPosition))) {
continue;
}
float fontSize = textPosition.getFontSizeInPt();
fontSizeTextMap.putIfAbsent(fontSize, new StringBuilder());
if (previousTextPosition != null && isThereSpace(previousTextPosition, textPosition)) {
fontSizeTextMap.get(fontSize).append(" ");
}
fontSizeTextMap.get(fontSize).append(textPosition.getUnicode());
lastPositionMap.put(fontSize, textPosition);
previousTextPosition = textPosition;
}
for (Map.Entry<Float, StringBuilder> entry : fontSizeTextMap.entrySet()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ private static Stream<Arguments> providePdfData() {
Arguments.of("On the impact of service-oriented patterns on software evolvability: a controlled experiment and metric-based analysis", "/pdfs/PdfContentImporter/Bogner2019.pdf"),
Arguments.of("Pandemic programming", "/pdfs/PdfContentImporter/Ralph2020.pdf"),
Arguments.of("Do RESTful API design rules have an impact on the understandability of Web APIs?", "/pdfs/PdfContentImporter/Bogner2023.pdf"),
Arguments.of("Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study", "/pdfs/PdfContentImporter/Fritzsch2022.pdf")
Arguments.of("Adopting microservices and DevOps in the cyber-physical systems domain: A rapid review and case study", "/pdfs/PdfContentImporter/Fritzsch2022.pdf"),
Arguments.of("OPIUM: Optimal Package Install/Uninstall Manager", "/pdfs/PdfContentImporter/opium.pdf")
);
}
}
Binary file not shown.

0 comments on commit 1e837cd

Please sign in to comment.