# Makefile for MapReduce Page Rank project.
# Customize these paths for your environment.
# -----------------------------------------------------------
hadoop.root=/usr/local/hadoop
jar.name=hw4-0.0.1-SNAPSHOT.jar
jar.path=target/${jar.name}
job.name=job1
local.input=/home/shubham/Desktop/MR/assignment4/input
local.output=/home/shubham/Desktop/MR/assignment4/output
# Pseudo-Cluster Execution
hdfs.user.name=joe
hdfs.input=input
hdfs.output=output
# AWS EMR Execution
aws.emr.release=emr-5.2.1
aws.region=us-east-1
aws.bucket.name=mrassignment3
aws.subnet.id=subnet-6356553a
aws.input=input
aws.output=output
aws.log.dir=log
aws.num.nodes=6
aws.instance.type=m4.large
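
# Note: any of these settings can be overridden per invocation without editing
# this file, e.g. (hypothetical paths):
#   make alone local.input=/tmp/pagerank/input local.output=/tmp/pagerank/output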
# -----------------------------------------------------------

# Compiles code and builds jar (with dependencies).
jar:
	mvn clean package
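
# Optional variant (not part of the original workflow): build the jar without
# running unit tests, which can speed up iteration; -DskipTests is a standard
# Maven flag.
jar-notest:
	mvn clean package -DskipTests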

# Removes local output directory.
clean-local-output:
	rm -rf ${local.output}*

# Runs standalone.
# Make sure Hadoop is set up (in /etc/hadoop files) for standalone operation (not pseudo-cluster).
# https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Standalone_Operation
alone: jar clean-local-output
	${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${local.input} ${local.output}

# Start HDFS
start-hdfs:
	${hadoop.root}/sbin/start-dfs.sh

# Stop HDFS
stop-hdfs:
	${hadoop.root}/sbin/stop-dfs.sh

# Start YARN
start-yarn: stop-yarn
	${hadoop.root}/sbin/start-yarn.sh

# Stop YARN
stop-yarn:
	${hadoop.root}/sbin/stop-yarn.sh

# Reformats & initializes HDFS.
format-hdfs: stop-hdfs
	rm -rf /tmp/hadoop*
	${hadoop.root}/bin/hdfs namenode -format

# Initializes user & input directories of HDFS.
init-hdfs: start-hdfs
	${hadoop.root}/bin/hdfs dfs -rm -r -f /user
	${hadoop.root}/bin/hdfs dfs -mkdir /user
	${hadoop.root}/bin/hdfs dfs -mkdir /user/${hdfs.user.name}
	${hadoop.root}/bin/hdfs dfs -mkdir /user/${hdfs.user.name}/${hdfs.input}

# Load data to HDFS.
upload-input-hdfs: start-hdfs
	${hadoop.root}/bin/hdfs dfs -put ${local.input}/* /user/${hdfs.user.name}/${hdfs.input}
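
# Optional check (assumption, not in the original workflow): list the HDFS
# input directory to confirm the upload landed where the job expects it.
check-input-hdfs: start-hdfs
	${hadoop.root}/bin/hdfs dfs -ls /user/${hdfs.user.name}/${hdfs.input}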

# Removes HDFS output directory.
clean-hdfs-output:
	${hadoop.root}/bin/hdfs dfs -rm -r -f ${hdfs.output}*

# Download output from HDFS to local.
download-output:
	mkdir ${local.output}
	${hadoop.root}/bin/hdfs dfs -get ${hdfs.output}/* ${local.output}

# Runs pseudo-clustered (ALL). ONLY RUN THIS ONCE, THEN USE: make pseudoq
# Make sure Hadoop is set up (in /etc/hadoop files) for pseudo-clustered operation (not standalone).
# https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html#Pseudo-Distributed_Operation
pseudo: jar stop-yarn format-hdfs init-hdfs upload-input-hdfs start-yarn clean-local-output
	${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${hdfs.input} ${hdfs.output}
	make download-output

# Runs pseudo-clustered (quickie).
pseudoq: jar clean-local-output clean-hdfs-output
	${hadoop.root}/bin/hadoop jar ${jar.path} ${job.name} ${hdfs.input} ${hdfs.output}
	make download-output

# Create S3 bucket.
make-bucket:
	aws s3 mb s3://${aws.bucket.name}

# Upload data to S3 input dir.
upload-input-aws: make-bucket
	aws s3 sync ${local.input} s3://${aws.bucket.name}/${aws.input}
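
# Optional check (assumption): list the S3 input prefix to confirm the sync
# completed before launching a cluster against it.
check-input-aws:
	aws s3 ls s3://${aws.bucket.name}/${aws.input}/ --recursive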

# Delete S3 output dir.
delete-output-aws:
	aws s3 rm s3://${aws.bucket.name}/ --recursive --exclude "*" --include "${aws.output}*"

# Upload application to S3 bucket.
upload-app-aws:
	aws s3 cp ${jar.path} s3://${aws.bucket.name}

# Main EMR launch.
cloud: jar upload-app-aws delete-output-aws
	aws emr create-cluster \
		--name "Cluster" \
		--release-label ${aws.emr.release} \
		--instance-groups '[{"InstanceCount":${aws.num.nodes},"InstanceGroupType":"CORE","InstanceType":"${aws.instance.type}"},{"InstanceCount":1,"InstanceGroupType":"MASTER","InstanceType":"${aws.instance.type}"}]' \
		--applications Name=Hadoop \
		--steps '[{"Args":["${job.name}","s3://${aws.bucket.name}/${aws.input}","s3://${aws.bucket.name}/${aws.output}"],"Type":"CUSTOM_JAR","Jar":"s3://${aws.bucket.name}/${jar.name}","ActionOnFailure":"TERMINATE_CLUSTER","Name":"Custom JAR"}]' \
		--log-uri s3://${aws.bucket.name}/${aws.log.dir} \
		--service-role EMR_DefaultRole \
		--ec2-attributes InstanceProfile=EMR_EC2_DefaultRole,SubnetId=${aws.subnet.id} \
		--region ${aws.region} \
		--enable-debugging \
		--auto-terminate
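
# Optional helper (assumption, not part of the original workflow): poll the
# active EMR clusters in the configured region to watch the launched job from
# the command line.
list-clusters-aws:
	aws emr list-clusters --active --region ${aws.region}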

# Download output from S3.
download-output-aws: clean-local-output
	mkdir ${local.output}
	aws s3 sync s3://${aws.bucket.name}/${aws.output} ${local.output}
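
# Optional helper (assumption): pull the EMR logs from S3 into a local log/
# directory (a hypothetical destination) for debugging failed steps.
download-logs-aws:
	mkdir -p log
	aws s3 sync s3://${aws.bucket.name}/${aws.log.dir} log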

# Change to standalone mode.
switch-standalone:
	cp config/standalone/*.xml ${hadoop.root}/etc/hadoop

# Change to pseudo-cluster mode.
switch-pseudo:
	cp config/pseudo/*.xml ${hadoop.root}/etc/hadoop

# Package for release.
distro:
	rm -rf build
	mkdir build
	mkdir build/deliv
	mkdir build/deliv/WordCount
	cp pom.xml build/deliv/WordCount
	cp -r src build/deliv/WordCount
	cp Makefile build/deliv/WordCount
	cp README.txt build/deliv/WordCount
	tar -czf WordCount.tar.gz -C build/deliv WordCount
	cd build/deliv && zip -rq ../../WordCount.zip WordCount