From 61be25b1c23818d0e54ecf0114a473698b40ac10 Mon Sep 17 00:00:00 2001 From: aarmaane <38818203+aarmaane@users.noreply.github.com> Date: Sun, 28 Jan 2024 20:08:54 -0500 Subject: [PATCH 1/5] Set shell=False on subprocess commands --- madoop/mapreduce.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/madoop/mapreduce.py b/madoop/mapreduce.py index 6afb58b..e98c271 100644 --- a/madoop/mapreduce.py +++ b/madoop/mapreduce.py @@ -178,7 +178,7 @@ def is_executable(exe): try: subprocess.run( str(exe), - shell=True, + shell=False, input="".encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -212,7 +212,7 @@ def map_stage(exe, input_dir, output_dir): try: subprocess.run( str(exe), - shell=True, + shell=False, check=True, stdin=infile, stdout=outfile, @@ -337,7 +337,7 @@ def reduce_stage(exe, input_dir, output_dir): try: subprocess.run( str(exe), - shell=True, + shell=False, check=True, stdin=infile, stdout=outfile, From 273758d76a317c3efd6a74a5b52b4438ecb20b23 Mon Sep 17 00:00:00 2001 From: aarmaane <38818203+aarmaane@users.noreply.github.com> Date: Sun, 28 Jan 2024 23:47:59 -0500 Subject: [PATCH 2/5] Catch OSError exceptions from subprocess changes --- madoop/mapreduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/madoop/mapreduce.py b/madoop/mapreduce.py index e98c271..138617c 100644 --- a/madoop/mapreduce.py +++ b/madoop/mapreduce.py @@ -184,7 +184,7 @@ def is_executable(exe): stderr=subprocess.PIPE, check=True, ) - except subprocess.CalledProcessError as err: + except (subprocess.CalledProcessError, OSError) as err: raise MadoopError(f"Failed executable test: {err}") from err From f595c1a3a86d018cc9d0a99d29e8c91a742858d9 Mon Sep 17 00:00:00 2001 From: Andrew DeOrio Date: Mon, 29 Jan 2024 08:56:56 -0500 Subject: [PATCH 3/5] Add test --- tests/test_api.py | 18 ++++++++++++ .../correct/output/part-00000 | 1 + .../correct/output/part-00001 | 3 ++ .../correct/output/part-00002 | 1 + .../word_count SPACE/input 
SPACE/input 01.txt | 2 ++ .../word_count SPACE/input SPACE/input 02.txt | 2 ++ tests/testdata/word_count SPACE/map SPACE.py | 9 ++++++ .../testdata/word_count SPACE/reduce SPACE.py | 28 +++++++++++++++++++ 8 files changed, 64 insertions(+) create mode 100644 tests/testdata/word_count SPACE/correct/output/part-00000 create mode 100644 tests/testdata/word_count SPACE/correct/output/part-00001 create mode 100644 tests/testdata/word_count SPACE/correct/output/part-00002 create mode 100644 tests/testdata/word_count SPACE/input SPACE/input 01.txt create mode 100644 tests/testdata/word_count SPACE/input SPACE/input 02.txt create mode 100755 tests/testdata/word_count SPACE/map SPACE.py create mode 100755 tests/testdata/word_count SPACE/reduce SPACE.py diff --git a/tests/test_api.py b/tests/test_api.py index 8754970..951e01a 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -125,3 +125,21 @@ def test_ignores_subdirs(tmpdir): TESTDATA_DIR/"word_count/correct/output", tmpdir/"output", ) + + +def test_input_path_spaces(tmpdir): + """Run a simple MapReduce job where the input directory, input files, + and map/reduce executables all have spaces in their paths. 
+ """ + with tmpdir.as_cwd(): + madoop.mapreduce( + input_path=TESTDATA_DIR/"word_count SPACE/input SPACE", + output_dir="output", + map_exe=TESTDATA_DIR/"word_count SPACE/map SPACE.py", + reduce_exe=TESTDATA_DIR/"word_count SPACE/reduce SPACE.py", + num_reducers=4 + ) + utils.assert_dirs_eq( + TESTDATA_DIR/"word_count/correct/output", + tmpdir/"output", + ) diff --git a/tests/testdata/word_count SPACE/correct/output/part-00000 b/tests/testdata/word_count SPACE/correct/output/part-00000 new file mode 100644 index 0000000..70db879 --- /dev/null +++ b/tests/testdata/word_count SPACE/correct/output/part-00000 @@ -0,0 +1 @@ +Goodbye 1 diff --git a/tests/testdata/word_count SPACE/correct/output/part-00001 b/tests/testdata/word_count SPACE/correct/output/part-00001 new file mode 100644 index 0000000..ecc21b4 --- /dev/null +++ b/tests/testdata/word_count SPACE/correct/output/part-00001 @@ -0,0 +1,3 @@ +Bye 1 +Hadoop 2 +World 2 diff --git a/tests/testdata/word_count SPACE/correct/output/part-00002 b/tests/testdata/word_count SPACE/correct/output/part-00002 new file mode 100644 index 0000000..30f4be7 --- /dev/null +++ b/tests/testdata/word_count SPACE/correct/output/part-00002 @@ -0,0 +1 @@ +Hello 2 diff --git a/tests/testdata/word_count SPACE/input SPACE/input 01.txt b/tests/testdata/word_count SPACE/input SPACE/input 01.txt new file mode 100644 index 0000000..c614f1f --- /dev/null +++ b/tests/testdata/word_count SPACE/input SPACE/input 01.txt @@ -0,0 +1,2 @@ +Hello World +Bye World diff --git a/tests/testdata/word_count SPACE/input SPACE/input 02.txt b/tests/testdata/word_count SPACE/input SPACE/input 02.txt new file mode 100644 index 0000000..acd80a3 --- /dev/null +++ b/tests/testdata/word_count SPACE/input SPACE/input 02.txt @@ -0,0 +1,2 @@ +Hello Hadoop +Goodbye Hadoop diff --git a/tests/testdata/word_count SPACE/map SPACE.py b/tests/testdata/word_count SPACE/map SPACE.py new file mode 100755 index 0000000..4d3caf7 --- /dev/null +++ b/tests/testdata/word_count 
SPACE/map SPACE.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 +"""Word count mapper.""" +import sys + + +for line in sys.stdin: + words = line.split() + for word in words: + print(f"{word}\t1") diff --git a/tests/testdata/word_count SPACE/reduce SPACE.py b/tests/testdata/word_count SPACE/reduce SPACE.py new file mode 100755 index 0000000..b244fe0 --- /dev/null +++ b/tests/testdata/word_count SPACE/reduce SPACE.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +"""Word count reducer.""" +import sys +import itertools + + +def main(): + """Divide sorted lines into groups that share a key.""" + for key, group in itertools.groupby(sys.stdin, keyfunc): + reduce_one_group(key, group) + + +def keyfunc(line): + """Return the key from a TAB-delimited key-value pair.""" + return line.partition("\t")[0] + + +def reduce_one_group(key, group): + """Reduce one group.""" + word_count = 0 + for line in group: + count = line.partition("\t")[2] + word_count += int(count) + print(f"{key} {word_count}") + + +if __name__ == "__main__": + main() From e95c35bad6cc48a4b27fc6c0946c7190aa58b523 Mon Sep 17 00:00:00 2001 From: Andrew DeOrio Date: Mon, 29 Jan 2024 08:57:03 -0500 Subject: [PATCH 4/5] Remove pdbpp --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5ef014a..74125b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ madoop = "madoop.__main__:main" [project.optional-dependencies] dev = [ - "pdbpp", "build", "twine", "tox", From 969a10c1759cd55ba32b796adbcc5625167fffe3 Mon Sep 17 00:00:00 2001 From: Andrew DeOrio Date: Mon, 29 Jan 2024 08:58:17 -0500 Subject: [PATCH 5/5] Remove unnecessary added files --- tests/testdata/word_count SPACE/correct/output/part-00000 | 1 - tests/testdata/word_count SPACE/correct/output/part-00001 | 3 --- tests/testdata/word_count SPACE/correct/output/part-00002 | 1 - 3 files changed, 5 deletions(-) delete mode 100644 
tests/testdata/word_count SPACE/correct/output/part-00001 delete mode 100644 tests/testdata/word_count SPACE/correct/output/part-00002 diff --git a/tests/testdata/word_count SPACE/correct/output/part-00000 b/tests/testdata/word_count SPACE/correct/output/part-00000 deleted file mode 100644 index 70db879..0000000 --- a/tests/testdata/word_count SPACE/correct/output/part-00000 +++ /dev/null @@ -1 +0,0 @@ -Goodbye 1 diff --git a/tests/testdata/word_count SPACE/correct/output/part-00001 b/tests/testdata/word_count SPACE/correct/output/part-00001 deleted file mode 100644 index ecc21b4..0000000 --- a/tests/testdata/word_count SPACE/correct/output/part-00001 +++ /dev/null @@ -1,3 +0,0 @@ -Bye 1 -Hadoop 2 -World 2 diff --git a/tests/testdata/word_count SPACE/correct/output/part-00002 b/tests/testdata/word_count SPACE/correct/output/part-00002 deleted file mode 100644 index 30f4be7..0000000 --- a/tests/testdata/word_count SPACE/correct/output/part-00002 +++ /dev/null @@ -1 +0,0 @@ -Hello 2