Skip to content

Commit

Permalink
Merge pull request #73 from aarmaane/develop
Browse files Browse the repository at this point in the history
Fix directories containing spaces
  • Loading branch information
awdeorio authored Jan 29, 2024
2 parents 5408a7d + 969a10c commit 8cf2165
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 5 deletions.
8 changes: 4 additions & 4 deletions madoop/mapreduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,13 +178,13 @@ def is_executable(exe):
try:
subprocess.run(
str(exe),
shell=True,
shell=False,
input="".encode(),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
check=True,
)
except subprocess.CalledProcessError as err:
except (subprocess.CalledProcessError, OSError) as err:
raise MadoopError(f"Failed executable test: {err}") from err


Expand Down Expand Up @@ -212,7 +212,7 @@ def map_stage(exe, input_dir, output_dir):
try:
subprocess.run(
str(exe),
shell=True,
shell=False,
check=True,
stdin=infile,
stdout=outfile,
Expand Down Expand Up @@ -337,7 +337,7 @@ def reduce_stage(exe, input_dir, output_dir):
try:
subprocess.run(
str(exe),
shell=True,
shell=False,
check=True,
stdin=infile,
stdout=outfile,
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ madoop = "madoop.__main__:main"

[project.optional-dependencies]
dev = [
"pdbpp",
"build",
"twine",
"tox",
Expand Down
18 changes: 18 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,21 @@ def test_ignores_subdirs(tmpdir):
TESTDATA_DIR/"word_count/correct/output",
tmpdir/"output",
)


def test_input_path_spaces(tmpdir):
    """Run a simple MapReduce job whose paths contain spaces.

    The input directory, the map executable and the reduce executable all
    live under names containing a space.  The job should produce the same
    output as the regular word count example.
    """
    # Run inside the temporary directory so the relative "output" directory
    # is created there.
    with tmpdir.as_cwd():
        madoop.mapreduce(
            input_path=TESTDATA_DIR/"word_count SPACE/input SPACE",
            output_dir="output",
            map_exe=TESTDATA_DIR/"word_count SPACE/map SPACE.py",
            reduce_exe=TESTDATA_DIR/"word_count SPACE/reduce SPACE.py",
            num_reducers=4,
        )
    # Compare against the expected output of the space-free word count job.
    utils.assert_dirs_eq(
        TESTDATA_DIR/"word_count/correct/output",
        tmpdir/"output",
    )
2 changes: 2 additions & 0 deletions tests/testdata/word_count SPACE/input SPACE/input 01.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Hello World
Bye World
2 changes: 2 additions & 0 deletions tests/testdata/word_count SPACE/input SPACE/input 02.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Hello Hadoop
Goodbye Hadoop
9 changes: 9 additions & 0 deletions tests/testdata/word_count SPACE/map SPACE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env python3
"""Word count mapper."""
import sys


# Emit one "<word>\t1" record for every whitespace-separated token on stdin.
for text in sys.stdin:
    for token in text.split():
        print(f"{token}\t1")
28 changes: 28 additions & 0 deletions tests/testdata/word_count SPACE/reduce SPACE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env python3
"""Word count reducer."""
import sys
import itertools


def main():
    """Group consecutive stdin lines that share a key and reduce each group."""
    # groupby only merges adjacent lines, so it relies on stdin being sorted.
    grouped = itertools.groupby(sys.stdin, key=keyfunc)
    for group_key, lines in grouped:
        reduce_one_group(group_key, lines)


def keyfunc(line):
    """Extract the key, i.e. the text before the first TAB in the line.

    A line without a TAB is returned unchanged.
    """
    key, _sep, _value = line.partition("\t")
    return key


def reduce_one_group(key, group):
    """Sum the counts of one group and print the key with its total.

    Each line in the group is a TAB-delimited key-value pair; the text after
    the first TAB is parsed as an integer count.
    """
    total = sum(int(record.partition("\t")[2]) for record in group)
    print(f"{key} {total}")


# Run the reducer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit 8cf2165

Please sign in to comment.