Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functionality to replace text #482

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions camelot/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ def set_config(self, key, value):
help="Characters that should be stripped from a string before"
" assigning it to a cell.",
)
@click.option(
"-replace",
"--replace_text",
help="Characters that should be replaced from a string before"
" assigning it to a cell.",
)
@click.option(
"-M",
"--margins",
Expand Down
3 changes: 3 additions & 0 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ def read_pdf(
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
row_tol^ : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
to generate rows.
Expand Down
6 changes: 6 additions & 0 deletions camelot/parsers/lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ class Lattice(BaseParser):
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
line_tol : int, optional (default: 2)
Tolerance parameter used to merge close vertical and horizontal
lines.
Expand Down Expand Up @@ -99,6 +102,7 @@ def __init__(
split_text=False,
flag_size=False,
strip_text="",
replace_text={},
line_tol=2,
joint_tol=2,
threshold_blocksize=15,
Expand All @@ -117,6 +121,7 @@ def __init__(
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.replace_text = replace_text
self.line_tol = line_tol
self.joint_tol = joint_tol
self.threshold_blocksize = threshold_blocksize
Expand Down Expand Up @@ -360,6 +365,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
replace_text=self.replace_text,
)
if indices[0][:2] != (-1, -1):
pos_errors.append(error)
Expand Down
6 changes: 6 additions & 0 deletions camelot/parsers/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class Stream(BaseParser):
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.
replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
row_tol : int, optional (default: 2)
Expand All @@ -64,6 +67,7 @@ def __init__(
split_text=False,
flag_size=False,
strip_text="",
replace_text={},
edge_tol=50,
row_tol=2,
column_tol=0,
Expand All @@ -76,6 +80,7 @@ def __init__(
self.split_text = split_text
self.flag_size = flag_size
self.strip_text = strip_text
self.replace_text = replace_text
self.edge_tol = edge_tol
self.row_tol = row_tol
self.column_tol = column_tol
Expand Down Expand Up @@ -414,6 +419,7 @@ def _generate_table(self, table_idx, cols, rows, **kwargs):
split_text=self.split_text,
flag_size=self.flag_size,
strip_text=self.strip_text,
replace_text=self.replace_text,
)
if indices[:2] != (-1, -1):
pos_errors.append(error)
Expand Down
79 changes: 65 additions & 14 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,12 +505,33 @@ def text_strip(text, strip=""):
return stripped


def text_replace(text, replace=None):
    """Replaces every occurrence of each key of `replace` found in `text`
    with the corresponding value.

    Replacements are applied one after another in the iteration order of
    `replace`, so text produced by an earlier replacement can be modified
    again by a later one.

    Parameters
    ----------
    text : str
        Text to process and modify.
    replace : dict, optional (default: None)
        Key value pairs, where keys are swapped for the values in `text`.
        ``None`` or an empty dict leaves `text` unchanged.

    Returns
    -------
    text : str

    """
    # An identity check against a dict literal (`replace is {}`) is never
    # true; truthiness covers both None and the empty dict.
    if not replace:
        return text

    for key, value in replace.items():
        text = text.replace(key, value)

    return text


# TODO: combine the following functions into a TextProcessor class which
# applies corresponding transformations sequentially
# (inspired from sklearn.pipeline.Pipeline)


def flag_font_size(textline, direction, strip_text=""):
def flag_font_size(textline, direction, strip_text="", replace_text={}):
"""Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives.

Expand All @@ -523,7 +544,9 @@ def flag_font_size(textline, direction, strip_text=""):
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.

replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
Returns
-------
fstring : string
Expand Down Expand Up @@ -559,10 +582,14 @@ def flag_font_size(textline, direction, strip_text=""):
fstring = "".join(flist)
else:
fstring = "".join([t.get_text() for t in textline])

fstring = text_replace(fstring, replace_text)
return text_strip(fstring, strip_text)


def split_textline(table, textline, direction, flag_size=False, strip_text=""):
def split_textline(
table, textline, direction, flag_size=False, strip_text="", replace_text={}
):
"""Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.

Expand All @@ -580,7 +607,9 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.

replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
Returns
-------
grouped_chars : list
Expand Down Expand Up @@ -668,20 +697,28 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
key[0],
key[1],
flag_font_size(
[t[2] for t in chars], direction, strip_text=strip_text
[t[2] for t in chars],
direction,
strip_text=strip_text,
replace_text=replace_text,
),
)
)
else:
gchars = [t[2].get_text() for t in chars]
grouped_chars.append(
(key[0], key[1], text_strip("".join(gchars), strip_text))
)
gchars = "".join([t[2].get_text() for t in chars])
gchars = text_replace(gchars, replace_text)
grouped_chars.append((key[0], key[1], text_strip(gchars, strip_text)))
return grouped_chars


def get_table_index(
table, t, direction, split_text=False, flag_size=False, strip_text=""
table,
t,
direction,
split_text=False,
flag_size=False,
strip_text="",
replace_text={},
):
"""Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates.
Expand All @@ -703,7 +740,9 @@ def get_table_index(
strip_text : str, optional (default: '')
Characters that should be stripped from a string before
assigning it to a cell.

replace_text : dict, optional (default: {})
Characters that should be replaced from a string before
assigning it to a cell.
Returns
-------
indices : list
Expand Down Expand Up @@ -761,7 +800,12 @@ def get_table_index(
if split_text:
return (
split_textline(
table, t, direction, flag_size=flag_size, strip_text=strip_text
table,
t,
direction,
flag_size=flag_size,
strip_text=strip_text,
replace_text=replace_text,
),
error,
)
Expand All @@ -772,13 +816,20 @@ def get_table_index(
(
r_idx,
c_idx,
flag_font_size(t._objs, direction, strip_text=strip_text),
flag_font_size(
t._objs,
direction,
strip_text=strip_text,
replace_text=replace_text,
),
)
],
error,
)
else:
return [(r_idx, c_idx, text_strip(t.get_text(), strip_text))], error
text = t.get_text()
text = text_replace(text, replace_text)
return [(r_idx, c_idx, text_strip(text, strip_text))], error


def compute_accuracy(error_weights):
Expand Down
Binary file added tests/.DS_Store
Binary file not shown.
52 changes: 52 additions & 0 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2306,6 +2306,32 @@
["ChâteauLéoube2016", "10€"],
]

# Expected cell values for test_stream_replace_text, which reads the PDF with
# strip_text=" ,\n" and replace_text={"€": "$", "20": "33"}.
data_stream_replace_text = [
    ["VinsauVerre", ""],
    ["LesBlancs", "12.5CL"],
    ["A.O.PCôtesduRhône", ""],
    ["DomainedelaGuicharde«Autourdelachapelle»3316", "8$"],
    ["A.O.PVacqueyras", ""],
    ["DomainedeMontvac«Melodine»3316", "10$"],
    ["A.O.PChâteauneufduPape", ""],
    ["DomainedeBeaurenard3317", "13$"],
    ["A.O.PCôteauxduLanguedoc", ""],
    ["VillaTempora«Untempspourelle»3314", "9$"],
    ["A.O.PCôtesdeProvence", ""],
    ["ChâteauGrandBoise3317", "9$"],
    ["LesRosés", "125CL"],
    ["A.O.PCôtesduRhône", ""],
    ["DomainedelaFlorane«AfleurdePampre»3316", "8$"],
    ["FamilleCoulon(DomaineBeaurenard)Biotifulfox3317", "8$"],
    ["A.O.PVacqueyras", ""],
    ["DomainedeMontvac3317", "9$"],
    ["A.O.PLanguedoc", ""],
    ["DomainedeJoncas«Nébla»3315", "8$"],
    ["VillaTempora«L’arroseurarrosé»3315", "9$"],
    ["A.O.PCôtesdeProvence", ""],
    ["ChâteauGrandBoise«SainteVictoire»3317", "9$"],
    ["ChâteauLéoube3316", "10$"],
]
data_stream_edge_tol = [
["Key figures", ""],
["", "2016"],
Expand Down Expand Up @@ -2368,6 +2394,32 @@
["4171_1", "0.07", "173.9", "58.1%", "1.6%", "2.1%", "0.5%"],
]

# Expected cell values for test_lattice_text_replace, which reads the PDF with
# replace_text={".": ","}.
data_lattice_text_replace = [
    [
        "Cycle \nName",
        "KI \n(1/km)",
        "Distance \n(mi)",
        "Percent Fuel Savings",
        "",
        "",
        "",
    ],
    [
        "",
        "",
        "",
        "Improved \nSpeed",
        "Decreased \nAccel",
        "Eliminate \nStops",
        "Decreased \nIdle",
    ],
    ["2012_2", "3,30", "1,3", "5,9%", "9,5%", "29,2%", "17,4%"],
    ["2145_1", "0,68", "11,2", "2,4%", "0,1%", "9,5%", "2,7%"],
    ["4234_1", "0,59", "58,7", "8,5%", "1,3%", "8,5%", "3,3%"],
    ["2032_2", "0,17", "57,8", "21,7%", "0,3%", "2,7%", "1,2%"],
    ["4171_1", "0,07", "173,9", "58,1%", "1,6%", "2,1%", "0,5%"],
]

data_lattice_table_rotated = [
[
"State",
Expand Down
11 changes: 11 additions & 0 deletions tests/test_lattice.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@ def test_lattice(testdir):
assert_frame_equal(df, tables[0].df)


@skip_on_windows
def test_lattice_text_replace(testdir):
    """Check that `replace_text` swaps "." for "," in the parsed cells."""
    pdf_path = os.path.join(
        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
    )
    expected = pd.DataFrame(data_lattice_text_replace)

    parsed = camelot.read_pdf(pdf_path, pages="2", replace_text={".": ","})

    assert_frame_equal(expected, parsed[0].df)


@skip_on_windows
def test_lattice_table_rotated(testdir):
df = pd.DataFrame(data_lattice_table_rotated)
Expand Down
14 changes: 14 additions & 0 deletions tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,20 @@ def test_stream_strip_text(testdir):
assert_frame_equal(df, tables[0].df)


def test_stream_replace_text(testdir):
    """Check stream parsing when `strip_text` and `replace_text` are both
    supplied."""
    pdf_path = os.path.join(testdir, "detect_vertical_false.pdf")
    expected = pd.DataFrame(data_stream_replace_text)

    read_options = {
        "flavor": "stream",
        "strip_text": " ,\n",
        "replace_text": {"€": "$", "20": "33"},
    }
    parsed = camelot.read_pdf(pdf_path, **read_options)

    assert_frame_equal(expected, parsed[0].df)


def test_stream_edge_tol(testdir):
df = pd.DataFrame(data_stream_edge_tol)

Expand Down