-
Notifications
You must be signed in to change notification settings - Fork 2
/
mrjob.conf.example
55 lines (43 loc) · 1.79 KB
/
mrjob.conf.example
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Example mrjob configuration (copy to mrjob.conf and fill in the placeholders).
# Options are grouped per runner under the top-level `runners` key; the nesting
# below is significant, so keep the 2-space indentation intact.
runners:
  inline:
    base_tmp_dir: /data/tmp
  local:
    base_tmp_dir: /data/tmp
  emr:
    # 1. Change these to your AWS access key and secret.
    #    Never commit a copy of this file that contains real credentials.
    aws_access_key_id: YOUR_ACCESS_KEY
    aws_secret_access_key: YOUR_SECRET_KEY
    aws_region: us-east-1
    ec2_master_instance_type: r3.xlarge
    ec2_instance_type: c3.2xlarge
    # Quoted so the version is always read as a string, never as a number.
    ami_version: '3.1.0'
    # 2. Specify a bid price if you want to use spot instances, or delete
    #    this line.
    ec2_core_instance_bid_price: '0.1'
    # 3. Specify the number of core instances (workers).
    num_ec2_core_instances: 2
    # Both settings below point into the virtualenv that the bootstrap
    # commands create at /env.
    interpreter: /env/bin/python
    cmdenv:
      PATH: /env/bin
      PYTHON: /env/bin/python
    # 4. Set the scratch URI to your bucket and path (the path can be
    #    anything; an example is provided).
    s3_scratch_uri: s3://YOUR_BUCKET/jobs/tmp/mrjob/
    bootstrap:
      # 5. Copy your access key and secret key from step 1 so the MapReduce
      #    machines have them.
      - echo "[Credentials]" >> ~/.boto
      - echo "aws_access_key_id=YOUR_ACCESS_KEY" >> ~/.boto
      - echo "aws_secret_access_key=YOUR_SECRET_KEY" >> ~/.boto
      # The file now holds secrets: make it readable by its owner only.
      - chmod 600 ~/.boto
      - export AWS_ACCESS_KEY_ID="YOUR_ACCESS_KEY"
      - export AWS_SECRET_ACCESS_KEY="YOUR_SECRET_KEY"
      - export AWS_DEFAULT_REGION="us-east-1"
      # 6. (optional) Install any extra system packages via yum. The
      #    virtualenv setup that follows is required, because `interpreter`
      #    and `cmdenv` above reference /env/bin.
      - sudo yum install -y git python27 libxml2 libxml2-devel libxslt libxslt-devel python27-devel python-pip gcc
      - sudo pip install virtualenv
      # -p keeps these steps from failing when the directory already exists.
      - sudo mkdir -p /env
      - sudo mkdir -p /opt/src
      # First group name of the current user, used as the owning group below.
      - GROUP=`groups | sed -r 's/ .*//g'`
      - sudo chown -R $USER:$GROUP /env /opt/src
      - virtualenv /env -p /usr/bin/python2.7
      - source /env/bin/activate
      # 7. (optional) Install any Python packages beyond what gets installed
      #    with setup.py.
      # - /env/bin/pip install YOUR_PACKAGES
      - /env/bin/pip install git+https://github.com/openvenues/common_crawl.git