hw #9

Open · wants to merge 5 commits into master

6 changes: 6 additions & 0 deletions .gitignore
@@ -1,3 +1,9 @@
.idea
/input
/output*
2017-big-data.iml
classes/
src/main/java/META-INF/
src/main/java/greg/META-INF/
target/

101 changes: 101 additions & 0 deletions src/main/java/greg/NamesCount.java
@@ -0,0 +1,101 @@
package greg;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Reads word-count pairs and keeps a capitalized token only when that spelling accounts for
// at least 99.5% of all case variants of the word, treating it as a proper name.
public class NamesCount extends Configured implements Tool {

// A capitalized token: one upper-case letter followed by lower-case letters or digits.
private static final Pattern namePattern = Pattern.compile("^[A-Z][a-z0-9]*$");

public static class MyMapper extends Mapper<LongWritable, Text, Text, TextWithCountWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Input lines are expected in the form "<count>\t<word>".
final String[] data = value.toString().split("\t");

String inputString = data[1];
int inputCount = Integer.parseInt(data[0]);

// Group all case variants under the lower-cased form, keeping the original spelling and its count.
context.write(new Text(inputString.toLowerCase()), new TextWithCountWritable(inputString, inputCount));
}
}

public static class MyReducer extends Reducer<Text, TextWithCountWritable, Text, IntWritable> {

@Override
protected void reduce(Text key, Iterable<TextWithCountWritable> values, Context context) throws IOException, InterruptedException {
int allCount = 0;       // total occurrences over all case variants of the word
int asNameCount = 0;    // occurrences of the capitalized spelling
String asNameText = null;

for (final TextWithCountWritable value : values){
allCount += value.getCount();

if (asNameText == null){
Matcher matcher = namePattern.matcher(value.getText());
if (matcher.matches()){
asNameText = value.getText();
asNameCount = value.getCount();
}
}
}

// No capitalized spelling of this word was seen at all.
if (asNameText == null){
return;
}

// Emit the token as a name only if the capitalized spelling covers at least 99.5% of all occurrences.
if (asNameCount / (double) allCount >= 0.995){
context.write(new Text(asNameText), new IntWritable(asNameCount));
}
}
}


@Override
public int run(String[] args) throws Exception {

// Use the configuration injected by ToolRunner instead of a fresh one, so generic options are honored.
final Job job = new Job(getConf(), "count names");
job.setJarByClass(NamesCount.class);

job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);

TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));

job.setInputFormatClass(TextInputFormat.class);
//job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(TextWithCountWritable.class);

job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
if(args.length != 2) {
throw new IllegalArgumentException("Usage: hadoop jar <name>.jar <Class> <folder_in> <folder_out>");
}

final int returnCode = ToolRunner.run(new Configuration(), new NamesCount(), args);

System.exit(returnCode);
}
}
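
Review note on NamesCount: the 99.5% rule in MyReducer is easy to sanity-check without a cluster. Below is a minimal plain-Java sketch of that rule; the class name NameRuleCheck and the sample tokens are made up for illustration and are not part of this PR.

    import java.util.regex.Pattern;

    // Stand-alone check of the rule used in NamesCount.MyReducer: a token is treated as a name
    // when its capitalized spelling accounts for at least 99.5% of all of its case variants.
    public class NameRuleCheck {

        private static final Pattern NAME_PATTERN = Pattern.compile("^[A-Z][a-z0-9]*$");

        static boolean isName(String capitalizedForm, int capitalizedCount, int totalCount) {
            return NAME_PATTERN.matcher(capitalizedForm).matches()
                    && capitalizedCount / (double) totalCount >= 0.995;
        }

        public static void main(String[] args) {
            // "London" written with a capital letter 1000 times out of 1001 passes the threshold (~0.999)...
            System.out.println(isName("London", 1000, 1001)); // true
            // ...while a word capitalized only half of the time does not (0.5).
            System.out.println(isName("The", 500, 1000));     // false
        }
    }

Pulling the 0.995 threshold into a named constant (or a Configuration property) would make checks like this, and later tuning, easier.
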
91 changes: 91 additions & 0 deletions src/main/java/greg/SortWords.java
@@ -0,0 +1,91 @@
package greg;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;

// Sorts the word counts produced by WordCount in descending order of frequency.
public class SortWords extends Configured implements Tool {

// Counts reduce() calls so the 7th most frequent count group can be printed for debugging.
private static int counter = 0;

public static class MyMapper extends Mapper <LongWritable, Text, IntWritable, Text> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// WordCount output lines have the form "<word>\t<count>"; swap them so the shuffle sorts by count.
final String[] data = value.toString().split("\t");
context.write(new IntWritable(Integer.parseInt(data[1])), new Text(data[0]));
}
}

// Identity pass-through that also prints one word from the 7th count group for debugging:
// reduce() is called once per distinct count, in descending order of count.
public static class MyReducer extends Reducer <IntWritable, Text, IntWritable, Text> {

@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
counter++;
boolean first = true;
for (Text value : values) {
if (7 == counter && first) {
System.out.println("\n\n\n\n\n\n\n\n\n\n\n" +
value.toString() +
"\t" + key.toString() + "\n\n\n\n\n\n\n\n\n\n");
}
first = false;
// Iterating here (instead of calling super.reduce after the debug print) keeps the printed word in the job output too.
context.write(key, value);
}
}
}

// Orders the shuffle keys (word counts) in descending order by comparing the serialized IntWritable bytes directly.
public static class DescendingIntComparator extends WritableComparator {

@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
return -Integer.compare(readInt(b1, s1), readInt(b2, s2));
}
}

@Override
public int run(String[] args) throws Exception {
// Use the configuration injected by ToolRunner so generic options are honored.
final Job job = new Job(getConf(), "sort words");
job.setJarByClass(SortWords.class);

job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setSortComparatorClass(DescendingIntComparator.class);

// Chained after WordCount: read the WordCount output folder (args[1]) and write the sorted result next to it.
TextInputFormat.addInputPath(job, new Path(args[1]));
TextOutputFormat.setOutputPath(job, new Path(args[1] + "_2"));

job.setInputFormatClass(TextInputFormat.class);
//job.setInputFormatClass(SequenceFileInputFormat.class);
//job.setOutputFormatClass(TextOutputFormat.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);

job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);

return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(final String[] args) throws Exception {

if(args.length != 2) {
throw new IllegalArgumentException("Usage: hadoop jar <name>.jar <Class> <folder_in> <folder_out>");
}

// First run the plain word count, then sort its output by frequency.
final int returnCode1 = ToolRunner.run(new Configuration(), new greg.WordCount(), args);
final int returnCode2 = ToolRunner.run(new Configuration(), new greg.SortWords(), args);

if(0 == returnCode1 && 0 == returnCode2)
System.exit(0);
System.exit(1);
}
}
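
Review note on SortWords: the byte-level comparator is correct because IntWritable serializes as four big-endian bytes, which is exactly what WritableComparator.readInt reads back. For reference, an object-level version that registers the key class is sketched below; the name DescendingIntWritableComparator is mine, not part of the PR, and the byte-level variant in the PR remains the faster option since it avoids deserialization.

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.io.WritableComparator;

    // Object-level descending comparator: registering the key class lets WritableComparator
    // deserialize the two keys and delegate to IntWritable.compareTo(); negating reverses the order.
    public class DescendingIntWritableComparator extends WritableComparator {

        protected DescendingIntWritableComparator() {
            super(IntWritable.class, true); // true: create key instances so the byte-level compare can deserialize
        }

        @SuppressWarnings("rawtypes")
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }
    }

One more note on the sorting stage: with the default single reducer the output is one globally ordered file; if the reducer count is ever raised, each part file would only be ordered internally, so either job.setNumReduceTasks(1) or a total-order partitioner would be needed to keep a global order.
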
102 changes: 102 additions & 0 deletions src/main/java/greg/StopWords.java
@@ -0,0 +1,102 @@
package greg;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashSet;
import java.util.Scanner;
import java.util.StringTokenizer;

// Counts, via Hadoop counters, what share of the input tokens are stop words.
public class StopWords extends Configured implements Tool {

private static HashSet<String> stopWords = new HashSet<String>();

// Fills the set in the client JVM, which is enough in local mode; on a real cluster the mapper
// tasks run in separate JVMs and would have to load the file themselves
// (a setup()-based alternative is sketched after this file).
public static void readStopWords(String stopWordsFilePath) {
try {
Scanner sc = new Scanner(new File(stopWordsFilePath));
while(sc.hasNextLine()) {
stopWords.add(sc.nextLine());
}
} catch (FileNotFoundException e) {
System.out.println(e.getMessage());
}
}

public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer tokenizer = new StringTokenizer(value.toString());
while (tokenizer.hasMoreTokens()) {
String word = tokenizer.nextToken();
context.getCounter(MATCH_COUNTER.ALL).increment(1);
if (stopWords.contains(word)) {
System.out.println("Contains: " + word);
context.getCounter(MATCH_COUNTER.STOP_WORD).increment(1);
}
}
}
}

public enum MATCH_COUNTER {
STOP_WORD,
ALL
}

@Override
public int run(String[] args) throws Exception {

final Job job = new Job(getConf(), "stop words");
job.setJarByClass(StopWords.class);

job.setMapperClass(MyMapper.class);

TextInputFormat.addInputPath(job, new Path(args[0]));
TextOutputFormat.setOutputPath(job, new Path(args[1]));

job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
// job.setMapOutputKeyClass(IntWritable.class);
// job.setMapOutputValueClass(Text.class);

job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

int exitCode = job.waitForCompletion(true) ? 0 : 1;

Counters counter = job.getCounters();

System.out.println("%: "
+ ((double) counter.findCounter(MATCH_COUNTER.STOP_WORD).getValue()
/ counter.findCounter(MATCH_COUNTER.ALL).getValue()));

return exitCode;
}

public static void main(String[] args) throws Exception {

if(args.length != 3) {
throw new IllegalArgumentException("Usage: hadoop jar <name>.jar <Class> <file_in> <folder_out> <stop_words_file>");
}

readStopWords(args[2]);

final int returnCode = ToolRunner.run(new Configuration(), new greg.StopWords(), args);

System.exit(returnCode);
}
}
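
Review note on StopWords: readStopWords fills the static set in the client JVM, which works in local mode, but on a cluster the mapper tasks run in their own JVMs and would see an empty set. One possible fix, sketched below, is to publish the file path through the Configuration and load the set in Mapper.setup(); the property name stopwords.path and the class name StopWordsMapper are illustrative, not something this PR defines.

    package greg;

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.util.HashSet;
    import java.util.Set;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    // Sketch of a mapper that loads the stop-word file itself. The driver would publish the path first,
    // e.g. in run():  job.getConfiguration().set("stopwords.path", args[2]);
    public class StopWordsMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final Set<String> stopWords = new HashSet<String>();

        @Override
        protected void setup(Context context) throws IOException {
            Configuration conf = context.getConfiguration();
            Path path = new Path(conf.get("stopwords.path"));
            FileSystem fs = path.getFileSystem(conf); // resolves to HDFS or the local FS as configured
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path)));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    stopWords.add(line.trim());
                }
            } finally {
                reader.close();
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Same counting logic as in the PR, now using the per-task stop-word set.
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                String word = tokenizer.nextToken();
                context.getCounter(StopWords.MATCH_COUNTER.ALL).increment(1);
                if (stopWords.contains(word)) {
                    context.getCounter(StopWords.MATCH_COUNTER.STOP_WORD).increment(1);
                }
            }
        }
    }
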
77 changes: 77 additions & 0 deletions src/main/java/greg/TextWithCountWritable.java
@@ -0,0 +1,77 @@
package greg;

// based on the implementation by Georgiy0 (https://github.com/Georgiy0/2017-big-data),
// which is in turn based on the implementation by a-filippo (https://github.com/a-filippo/2017-big-data)

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class TextWithCountWritable implements WritableComparable<TextWithCountWritable>, Cloneable {
private String text;
private int count;

@Override
public void write(DataOutput dataOutput) throws IOException {
// Serialization order (count first, then text) must match readFields() below.
dataOutput.writeInt(count);
dataOutput.writeUTF(text);
}

@Override
public void readFields(DataInput dataInput) throws IOException {
count = dataInput.readInt();
text = dataInput.readUTF();
}

public String getText() {
return text;
}

public int getCount() {
return count;
}

TextWithCountWritable(){}

TextWithCountWritable(String text, int count) {
this.text = text;
this.count = count;
}

@Override
public boolean equals(Object other) {
if (other == null){
return false;
}
if (other == this){
return true;
}
if (!(other instanceof TextWithCountWritable)){
return false;
}
TextWithCountWritable otherMyClass = (TextWithCountWritable)other;
if (otherMyClass.count != count){
return false;
}
if (!otherMyClass.text.equals(text)){
return false;
}
return true;
}

@Override
protected TextWithCountWritable clone() {
return new TextWithCountWritable(text, count);
}

@Override
public int compareTo(TextWithCountWritable o) {
if (equals(o)){
return 0;
}
// Sort by count in descending order; break ties by hash code so unequal objects never compare as 0.
int intCompare = Integer.compare(count, o.count);
return (intCompare == 0) ? Integer.compare(this.hashCode(), o.hashCode()) : -intCompare;
}
}
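
Review note on TextWithCountWritable: equals() is overridden but hashCode() is not, so equal instances can hash differently, and the tie-break in compareTo() falls back to identity hash codes that vary from run to run. A minimal hashCode() consistent with the current equals(), meant to be added to this class, could look like:

    @Override
    public int hashCode() {
        // Keep hashCode() consistent with equals(): equal (text, count) pairs produce equal hashes.
        int result = (text == null) ? 0 : text.hashCode();
        return 31 * result + count;
    }
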