
Commit

Merge remote-tracking branch 'origin/develop'
awdeorio committed Nov 4, 2023
2 parents 3eb2ecb + 5408a7d commit f2f9dac
Showing 20 changed files with 233 additions and 80 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/continuous_integration.yml
@@ -21,7 +21,7 @@ jobs:
     strategy:
       # Define OS and Python versions to use. 3.x is the latest minor version.
       matrix:
-        python-version: ["3.6", "3.x"] # 3.x is the latest minor version
+        python-version: ["3.x"] # 3.x is the latest minor version
         os: [ubuntu-latest]

     # Sequence of tasks for this job
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -83,6 +83,6 @@ Create a release on GitHub using the "Auto-generate release notes" feature. http

 Upload to PyPI
 ```console
-$ python3 setup.py sdist bdist_wheel
+$ python3 -m build
 $ twine upload --sign dist/*
 ```
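
Note: `python3 -m build` performs a PEP 517 build, reading its configuration from the `pyproject.toml` introduced in this commit (the deleted `setup.py` is no longer needed); it produces the same sdist and wheel artifacts in `dist/` for `twine` to upload.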
7 changes: 6 additions & 1 deletion madoop/__main__.py
@@ -34,6 +34,10 @@ def main():
         '-v', '--verbose', action='count', default=0,
         help="verbose output"
     )
+    optional_args.add_argument(
+        '-numReduceTasks', dest='num_reducers', default=4,
+        help="max number of reducers"
+    )
     required_args = parser.add_argument_group('required arguments')
     required_args.add_argument('-input', dest='input', required=True)
     required_args.add_argument('-output', dest='output', required=True)
@@ -56,10 +60,11 @@ def main():
     # Run MapReduce API
     try:
         mapreduce(
-            input_dir=args.input,
+            input_path=args.input,
             output_dir=args.output,
             map_exe=args.mapper,
             reduce_exe=args.reducer,
+            num_reducers=int(args.num_reducers)
         )
     except MadoopError as err:
         sys.exit(f"Error: {err}")
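
These two hunks thread the new `-numReduceTasks` flag through to the API as `num_reducers`. A minimal sketch of the equivalent direct call, with hypothetical paths and executables (only the keyword arguments come from this diff):

```python
# Hypothetical usage of the updated madoop API; paths are placeholders.
from madoop.mapreduce import mapreduce

mapreduce(
    input_path="input",        # a single file or a directory of input files
    output_dir="output",       # must not already exist (madoop won't clobber it)
    map_exe="./map.py",        # executable mapper, Hadoop Streaming style
    reduce_exe="./reduce.py",  # executable reducer
    num_reducers=4,            # mirrors the -numReduceTasks default above
)
```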
63 changes: 44 additions & 19 deletions madoop/mapreduce.py
@@ -19,14 +19,13 @@
 MAX_INPUT_SPLIT_SIZE = 2**20  # 1 MB

 # The number of reducers is dynamically determined by the number of unique keys
-# but will not be more than MAX_NUM_REDUCE
-MAX_NUM_REDUCE = 4
+# but will not be more than num_reducers

 # Madoop logger
 LOGGER = logging.getLogger("madoop")


-def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
+def mapreduce(input_path, output_dir, map_exe, reduce_exe, num_reducers):
     """Madoop API."""
     # Do not clobber existing output directory
     output_dir = pathlib.Path(output_dir)
@@ -54,8 +53,8 @@ def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
     reduce_output_dir.mkdir()

     # Copy and rename input files: part-00000, part-00001, etc.
-    input_dir = pathlib.Path(input_dir)
-    prepare_input_files(input_dir, map_input_dir)
+    input_path = pathlib.Path(input_path)
+    prepare_input_files(input_path, map_input_dir)

     # Executables must be absolute paths
     map_exe = pathlib.Path(map_exe).resolve()
@@ -74,6 +73,7 @@ def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
     group_stage(
         input_dir=map_output_dir,
         output_dir=reduce_input_dir,
+        num_reducers=num_reducers
     )

     # Run the reducing stage
@@ -98,25 +98,23 @@ def mapreduce(input_dir, output_dir, map_exe, reduce_exe):
     LOGGER.info("Output directory: %s", output_dir)


-def prepare_input_files(input_dir, output_dir):
+def prepare_input_files(input_path, output_dir):
     """Copy and split input files. Rename to part-00000, part-00001, etc.
-    If a file in input_dir is smaller than MAX_INPUT_SPLIT_SIZE, then copy it
-    to output_dir. For larger files, split into blocks of MAX_INPUT_SPLIT_SIZE
-    bytes and write block to output_dir. Input files will never be combined.
+    The input_path can be a file or a directory of files. If a file is smaller
+    than MAX_INPUT_SPLIT_SIZE, then copy it to output_dir. For larger files,
+    split into blocks of MAX_INPUT_SPLIT_SIZE bytes and write block to
+    output_dir. Input files will never be combined.
     The number of files created will be the number of mappers since we will
     assume that the number of tasks per mapper is 1. Apache Hadoop has a
     configurable number of tasks per mapper, however for both simplicity and
     because our use case has smaller inputs we use 1.
     """
-    assert input_dir.is_dir(), f"Can't find input_dir '{input_dir}'"
-
     # Split and copy input files
     part_num = 0
     total_size = 0
-    for inpath in sorted(input_dir.glob('*')):
+    for inpath in normalize_input_paths(input_path):
         assert inpath.is_file()

         # Compute output filenames
@@ -148,6 +146,26 @@ def prepare_input_files(input_dir, output_dir):
     LOGGER.debug("total input size=%sB", total_size)


+def normalize_input_paths(input_path):
+    """Return a list of filtered input files.
+    If input_path is a file, then use it. If input_path is a directory, then
+    grab all the *files* inside. Ignore subdirectories.
+    """
+    input_paths = []
+    if input_path.is_dir():
+        for path in sorted(input_path.glob('*')):
+            if path.is_file():
+                input_paths.append(path)
+            else:
+                LOGGER.warning("Ignoring non-file: %s", path)
+    elif input_path.is_file():
+        input_paths.append(input_path)
+    assert input_paths, f"No input: {input_path}"
+    return input_paths


def is_executable(exe):
"""Verify exe is executable and raise exception if it is not.
Expand Down Expand Up @@ -222,37 +240,43 @@ def keyhash(key):
     return int(hexdigest, base=16)


-def partition_keys(inpath, outpaths, input_keys_stats, output_keys_stats):
+def partition_keys(
+        inpath,
+        outpaths,
+        input_keys_stats,
+        output_keys_stats,
+        num_reducers):
     """Allocate lines of inpath among outpaths using hash of key.
     Update the data structures provided by the caller input_keys_stats and
     output_keys_stats. Both map a filename to a set of keys.
     """
-    assert len(outpaths) == MAX_NUM_REDUCE
+    assert len(outpaths) == num_reducers
     outparent = outpaths[0].parent
     assert all(i.parent == outparent for i in outpaths)
     with contextlib.ExitStack() as stack:
         outfiles = [stack.enter_context(p.open("a")) for p in outpaths]
         for line in stack.enter_context(inpath.open()):
             key = line.partition('\t')[0]
             input_keys_stats[inpath].add(key)
-            reducer_idx = keyhash(key) % MAX_NUM_REDUCE
+            reducer_idx = keyhash(key) % num_reducers
             outfiles[reducer_idx].write(line)
             outpath = outpaths[reducer_idx]
             output_keys_stats[outpath].add(key)


-def group_stage(input_dir, output_dir):
+def group_stage(input_dir, output_dir, num_reducers):
     """Run group stage.
     Process each mapper output file, allocating lines to grouper output files
     using the hash and modulo of the key.
     """
     # Compute output filenames
+    LOGGER.debug("%s reducers", num_reducers)
     outpaths = []
-    for i in range(MAX_NUM_REDUCE):
+    for i in range(num_reducers):
         outpaths.append(output_dir/part_filename(i))

     # Track keyspace stats, map filename -> set of keys
@@ -261,7 +285,8 @@ def group_stage(input_dir, output_dir):

     # Partition input, appending to output files
     for inpath in sorted(input_dir.iterdir()):
-        partition_keys(inpath, outpaths, input_keys_stats, output_keys_stats)
+        partition_keys(inpath, outpaths, input_keys_stats,
+                       output_keys_stats, num_reducers)

     # Log input keyspace stats
     all_input_keys = set()
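
The invariant behind these hunks: a line's key (everything before the first tab) is hashed, and the hash modulo `num_reducers` selects the part file, so every line with the same key reaches the same reducer. A self-contained illustration follows; the md5-based `keyhash` is an inference from the `int(hexdigest, base=16)` context line above, not confirmed by this diff:

```python
# Illustration of the hash-partitioning rule in partition_keys/group_stage.
# The md5 hash is an assumption inferred from the diff's keyhash() context.
import hashlib


def keyhash(key):
    """Hash a key string to a (large) integer."""
    hexdigest = hashlib.md5(key.encode()).hexdigest()
    return int(hexdigest, base=16)


num_reducers = 4
for line in ["apple\t1\n", "banana\t1\n", "apple\t2\n"]:
    key = line.partition('\t')[0]
    print(f"{key!r} -> part-{keyhash(key) % num_reducers:05d}")
# Both "apple" lines map to the same part file, which is what guarantees a
# reducer sees all values for each of its keys.
```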
43 changes: 43 additions & 0 deletions pyproject.toml
@@ -0,0 +1,43 @@
[build-system]
requires = ["setuptools>=64.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "madoop"
version = "1.1.0"
description = "A lightweight MapReduce framework for education."
license = {file = "LICENSE"}
authors = [
{name = "Andrew DeOrio", email = "[email protected]"}
]
readme = "README.md"
keywords = [
"madoop", "Hadoop", "MapReduce", "Michigan Hadoop", "Hadoop Streaming"
]
requires-python = ">=3.6"

[project.urls]
repository = "https://github.com/eecs485staff/madoop/"
documentation = "https://github.com/eecs485staff/madoop/blob/develop/README_Hadoop_Streaming.md#hadoop-streaming-in-python"

[project.scripts]
madoop = "madoop.__main__:main"

[project.optional-dependencies]
dev = [
"pdbpp",
"build",
"twine",
"tox",
"check-manifest",
"freezegun",
"pycodestyle",
"pydocstyle",
"pylint",
"pytest",
"pytest-cov",
]

[tool.setuptools.packages.find]
where = ["."]
include = ["madoop*"]
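
The `[project.scripts]` table above is what makes pip install a `madoop` console command wired to `madoop.__main__:main`, the same entry point whose argument parsing changed earlier in this commit.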
48 changes: 0 additions & 48 deletions setup.py

This file was deleted.
