diff --git a/.gitignore b/.gitignore
index e3b6fa2..b130b76 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 .idea
+build
+.gradle
 /input
 /output*
diff --git a/README.md b/README.md
index 997652b..46e829b 100644
--- a/README.md
+++ b/README.md
@@ -23,5 +23,4 @@ From github.com:polis-mail-ru/2017-big-data
 
 Пример конфигурация для запуска таски из idea можно посмотреть в файле config.png
 
-В своём Java package `` выполните все задания.
-
+В своём Java package `` выполните все задания.
\ No newline at end of file
diff --git a/answers.txt b/answers.txt
deleted file mode 100644
index 00fa802..0000000
--- a/answers.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-2. is 126420
-3. 41.602
-4. french 5742
diff --git a/build.gradle b/build.gradle
new file mode 100644
index 0000000..98472ad
--- /dev/null
+++ b/build.gradle
@@ -0,0 +1,27 @@
+group 'com.olerom'
+version '1.0-SNAPSHOT'
+
+apply plugin: 'java'
+
+sourceCompatibility = 1.8
+
+repositories {
+    mavenCentral()
+}
+
+dependencies {
+    compile 'org.apache.hadoop:hadoop-mapreduce:2.7.0'
+    compile 'org.apache.hadoop:hadoop-core:0.20.2'
+    testCompile group: 'junit', name: 'junit', version: '4.12'
+}
+
+task fatJar(type: Jar) {
+    manifest {
+        attributes 'Implementation-Title': 'Gradle Jar File Example',
+                'Implementation-Version': version,
+                'Main-Class': 'olerom.wordcount.WordCount'
+    }
+    baseName = project.name + '-fat-jar'
+    from { configurations.compile.collect { it.isDirectory() ? it : zipTree(it) } }
+    with jar
+}
\ No newline at end of file
diff --git a/config.png b/config.png
deleted file mode 100644
index 2ed2c3b..0000000
Binary files a/config.png and /dev/null differ
diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000..f14ff33
Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000..453667c
--- /dev/null
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Sun Oct 29 00:17:07 MSK 2017
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-3.3-all.zip
diff --git a/gradlew b/gradlew
new file mode 100755
index 0000000..4453cce
--- /dev/null
+++ b/gradlew
@@ -0,0 +1,172 @@
+#!/usr/bin/env sh
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS=""
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn ( ) {
+    echo "$*"
+}
+
+die ( ) {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+nonstop=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+  NONSTOP* )
+    nonstop=true
+    ;;
+esac
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin, switch paths to Windows format before running java
+if $cygwin ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=$((i+1))
+    done
+    case $i in
+        (0) set -- ;;
+        (1) set -- "$args0" ;;
+        (2) set -- "$args0" "$args1" ;;
+        (3) set -- "$args0" "$args1" "$args2" ;;
+        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Escape application args
+save ( ) {
+    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
+    echo " "
+}
+APP_ARGS=$(save "$@")
+
+# Collect all arguments for the java command, following the shell quoting and substitution rules
+eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+
+# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
+if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
+  cd "$(dirname "$0")"
+fi
+
+exec "$JAVACMD" "$@"
diff --git a/gradlew.bat b/gradlew.bat
new file mode 100644
index 0000000..e95643d
--- /dev/null
+++ b/gradlew.bat
@@ -0,0 +1,84 @@
+@if "%DEBUG%" == "" @echo off
+@rem ##########################################################################
+@rem
+@rem  Gradle startup script for Windows
+@rem
+@rem ##########################################################################
+
+@rem Set local scope for the variables with windows NT shell
+if "%OS%"=="Windows_NT" setlocal
+
+set DIRNAME=%~dp0
+if "%DIRNAME%" == "" set DIRNAME=.
+set APP_BASE_NAME=%~n0
+set APP_HOME=%DIRNAME%
+
+@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+set DEFAULT_JVM_OPTS=
+
+@rem Find java.exe
+if defined JAVA_HOME goto findJavaFromJavaHome
+
+set JAVA_EXE=java.exe
+%JAVA_EXE% -version >NUL 2>&1
+if "%ERRORLEVEL%" == "0" goto init
+
+echo.
+echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:findJavaFromJavaHome
+set JAVA_HOME=%JAVA_HOME:"=%
+set JAVA_EXE=%JAVA_HOME%/bin/java.exe
+
+if exist "%JAVA_EXE%" goto init
+
+echo.
+echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
+echo.
+echo Please set the JAVA_HOME variable in your environment to match the
+echo location of your Java installation.
+
+goto fail
+
+:init
+@rem Get command-line arguments, handling Windows variants
+
+if not "%OS%" == "Windows_NT" goto win9xME_args
+
+:win9xME_args
+@rem Slurp the command line arguments.
+set CMD_LINE_ARGS=
+set _SKIP=2
+
+:win9xME_args_slurp
+if "x%~1" == "x" goto execute
+
+set CMD_LINE_ARGS=%*
+
+:execute
+@rem Setup the command line
+
+set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
+
+@rem Execute Gradle
+"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
+
+:end
+@rem End local scope for the variables with windows NT shell
+if "%ERRORLEVEL%"=="0" goto mainEnd
+
+:fail
+rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
+rem the _cmd.exe /c_ return code!
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
+exit /b 1
+
+:mainEnd
+if "%OS%"=="Windows_NT" endlocal
+
+:omega
diff --git a/measuring.txt b/measuring.txt
new file mode 100644
index 0000000..45fe52f
--- /dev/null
+++ b/measuring.txt
@@ -0,0 +1,2 @@
+WordCount без комбайнера: 2.5+ часа
+WordCount с комбайнером: 1 час 42 минуты
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
deleted file mode 100644
index 16acaa5..0000000
--- a/pom.xml
+++ /dev/null
@@ -1,47 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-    <groupId>pritykovskaya</groupId>
-    <artifactId>2017-big-data</artifactId>
-    <packaging>jar</packaging>
-    <version>1.0-SNAPSHOT</version>
-    <name>2017-big-data</name>
-    <url>http://maven.apache.org</url>
-
-    <properties>
-        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-        <hadoop.version>2.6.0</hadoop.version>
-    </properties>
-
-    <dependencies>
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-client</artifactId>
-            <version>${hadoop.version}</version>
-        </dependency>
-    </dependencies>
-
-    <build>
-        <plugins>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-jar-plugin</artifactId>
-                <configuration>
-                    <archive>
-                        <manifest>
-                            <mainClass>pritykovskaya.WordCount</mainClass>
-                        </manifest>
-                    </archive>
-                </configuration>
-            </plugin>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-compiler-plugin</artifactId>
-                <configuration>
-                    <source>1.6</source>
-                    <target>1.6</target>
-                </configuration>
-            </plugin>
-        </plugins>
-    </build>
-</project>
diff --git a/settings.gradle b/settings.gradle
new file mode 100644
index 0000000..429bcde
--- /dev/null
+++ b/settings.gradle
@@ -0,0 +1,3 @@
+rootProject.name = 'big-data-course'
+rootProject.name = 'big-data-course'
+
diff --git a/src/main/java/olerom/namecount/NameCount.java b/src/main/java/olerom/namecount/NameCount.java
new file mode 100644
index 0000000..8c13192
--- /dev/null
+++ b/src/main/java/olerom/namecount/NameCount.java
@@ -0,0 +1,42 @@
+package olerom.namecount;
+
+import olerom.wordcount.WordCountReducer;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * Date: 30.10.17
+ *
+ * @author olerom
+ */
+public class NameCount extends Configured implements Tool {
+    public int run(final String[] strings) throws Exception {
+        final Job job = new Job(getConf(), "olerom.namecount.NameCount");
+        job.setJarByClass(getClass());
+
+        TextInputFormat.addInputPath(job, new Path(strings[0]));
+        job.setInputFormatClass(TextInputFormat.class);
+
+        job.setMapperClass(NameCountMapper.class);
+        job.setReducerClass(NameCountReducer.class);
+
+        TextOutputFormat.setOutputPath(job, new Path(strings[1]));
+        job.setOutputFormatClass(TextOutputFormat.class);
+        job.setOutputKeyClass(Text.class);
+        job.setOutputValueClass(IntWritable.class);
+
+        return job.waitForCompletion(true) ? 0 : 1;
+    }
+
+    public static void main(String[] args) throws Exception {
+        int exitCode = ToolRunner.run(new NameCount(), args);
+        System.exit(exitCode);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/namecount/NameCountMapper.java b/src/main/java/olerom/namecount/NameCountMapper.java
new file mode 100644
index 0000000..597e1e0
--- /dev/null
+++ b/src/main/java/olerom/namecount/NameCountMapper.java
@@ -0,0 +1,55 @@
+package olerom.namecount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+import java.io.IOException;
+import java.util.Random;
+import java.util.StringTokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Date: 30.10.17
+ *
+ * @author olerom
+ */
+public class NameCountMapper
+        extends Mapper<LongWritable, Text, Text, IntWritable> {
+
+    private final static IntWritable ONE = new IntWritable(1);
+    private final Text word = new Text();
+
+    @Override
+    protected void map(LongWritable key, Text value, Context context)
+            throws IOException, InterruptedException {
+        StringTokenizer tokenizer = new StringTokenizer(value.toString());
+
+        while (tokenizer.hasMoreTokens()) {
+            final String token = tokenizer.nextToken();
+            if (!Character.isDigit(token.charAt(0))) {
+                if (isName(token)) {
+                    word.set(token);
+                    context.write(word, ONE);
+                } else {
+                    context.getCounter("groupName", token.toLowerCase()).increment(1);
+                }
+            }
+        }
+    }
+
+    private boolean isName(final String token) {
+        if (token == null || token.length() == 0) {
+            return false;
+        }
+
+        final char firstLetter = token.charAt(0);
+        final String otherLetters = token.substring(1, token.length());
+        final Pattern pattern = Pattern.compile("[a-z0-9]+");
+        final Matcher matcher = pattern.matcher(otherLetters);
+
+        return Character.isUpperCase(firstLetter) && matcher.matches();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/namecount/NameCountReducer.java b/src/main/java/olerom/namecount/NameCountReducer.java
new file mode 100644
index 0000000..57f44b3
--- /dev/null
+++ b/src/main/java/olerom/namecount/NameCountReducer.java
@@ -0,0 +1,35 @@
+package olerom.namecount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+/**
+ * Date: 13.11.17
+ *
+ * @author olerom
+ */
+public class NameCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
+
+    @Override
+    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
+            throws IOException, InterruptedException {
+
+        System.out.println();
+
+        int totalSum = 0;
+        for (IntWritable value : values) {
+            totalSum += value.get();
+        }
+
+        System.out.println("1: " + totalSum);
+        totalSum += Math.round(context.getCounter("groupName", key.toString().toLowerCase()).getValue()
+                / (totalSum * 0.005));
+        System.out.println(context.getCounter("groupName", key.toString().toLowerCase()).getValue());
+        System.out.println("2: " + totalSum);
+
+        context.write(key, new IntWritable(totalSum));
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/sortedwordcount/SortedWordCount.java b/src/main/java/olerom/sortedwordcount/SortedWordCount.java
new file mode 100644
index 0000000..3911e5e
--- /dev/null
+++ b/src/main/java/olerom/sortedwordcount/SortedWordCount.java
@@ -0,0 +1,71 @@
+package olerom.sortedwordcount;
+
+import olerom.wordcount.WordCount;
+import olerom.wordcount.WordCountMapper;
+import olerom.wordcount.WordCountReducer;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * Date: 30.10.17
+ *
+ * @author olerom
+ */
+public class SortedWordCount extends Configured implements Tool {
+    public int run(final String[] strings) throws Exception {
+        final Job firstJob = new Job(getConf(), "Default WC");
+        firstJob.setJarByClass(getClass());
+
+        TextInputFormat.addInputPath(firstJob, new Path(strings[0]));
+        firstJob.setInputFormatClass(TextInputFormat.class);
+
+        firstJob.setMapperClass(WordCountMapper.class);
+        firstJob.setReducerClass(WordCountReducer.class);
+
+        TextOutputFormat.setOutputPath(firstJob, new Path(strings[1]));
+        firstJob.setOutputFormatClass(TextOutputFormat.class);
+        firstJob.setOutputKeyClass(Text.class);
+        firstJob.setOutputValueClass(IntWritable.class);
+// Second job
+        JobConf secondJobConf = new JobConf(WordCount.class);
+        secondJobConf.setJobName("Sorted WC");
+
+//        secondJobConf.setOutputKeyClass(Text.class);
+//        secondJobConf.setOutputValueClass(IntWritable.class);
+        secondJobConf.setOutputKeyClass(IntWritable.class);
+        secondJobConf.setOutputValueClass(Text.class);
+
+        secondJobConf.setMapperClass(SortedWordCountMapper.class);
+        secondJobConf.setReducerClass(SortedWordCountReducer.class);
+
+        secondJobConf.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
+        secondJobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
+
+        System.out.println(strings[1] + "/part-00000");
+        org.apache.hadoop.mapred.FileInputFormat.setInputPaths(secondJobConf, new Path(strings[1] + "/part-r-00000"));
+        org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(secondJobConf, new Path(strings[1] + "/result"));
+
+        final Job secondJob = new Job(secondJobConf);
+
+        firstJob.submit();
+        if (firstJob.waitForCompletion(true)) {
+            secondJob.submit();
+            secondJob.waitForCompletion(true);
+            return 0;
+        }
+        return 1;
+    }
+
+    public static void main(String[] args) throws Exception {
+        final int exitCode = ToolRunner.run(new SortedWordCount(), args);
+        System.exit(exitCode);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/sortedwordcount/SortedWordCountMapper.java b/src/main/java/olerom/sortedwordcount/SortedWordCountMapper.java
new file mode 100644
index 0000000..3f787d7
--- /dev/null
+++ b/src/main/java/olerom/sortedwordcount/SortedWordCountMapper.java
@@ -0,0 +1,44 @@
+package olerom.sortedwordcount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.StringTokenizer;
+
+/**
+ * Date: 12.11.17
+ *
+ * @author olerom
+ */
+public class SortedWordCountMapper extends MapReduceBase implements Mapper<Object, Text, IntWritable, Text> {
+
+    public void map(Object key, Text value, OutputCollector<IntWritable, Text> collector, Reporter arg3) throws IOException {
+        String line = value.toString();
+        StringTokenizer stringTokenizer = new StringTokenizer(line);
+        {
+            int number = -1;
+            String word = "empty";
+
+            if (stringTokenizer.hasMoreTokens()) {
+                String str0 = stringTokenizer.nextToken();
+                word = str0.trim();
+            }
+
+            if (stringTokenizer.hasMoreElements()) {
+                String str1 = stringTokenizer.nextToken();
+                number = (-1) * Integer.parseInt(str1.trim());
+            }
+            if (number != 1)
+                collector.collect(new IntWritable(number), new Text(word));
+        }
+
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/sortedwordcount/SortedWordCountReducer.java b/src/main/java/olerom/sortedwordcount/SortedWordCountReducer.java
new file mode 100644
index 0000000..e9e7142
--- /dev/null
+++ b/src/main/java/olerom/sortedwordcount/SortedWordCountReducer.java
@@ -0,0 +1,29 @@
+package olerom.sortedwordcount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * Date: 30.10.17
+ *
+ * @author olerom
+ */
+public class SortedWordCountReducer extends MapReduceBase implements Reducer<IntWritable, Text, Text, IntWritable> {
+
+    public void reduce(IntWritable key,
+                       Iterator<Text> values,
+                       OutputCollector<Text, IntWritable> collector,
+                       Reporter reporter) throws IOException {
+        while ((values.hasNext())) {
+            collector.collect(values.next(), new IntWritable(-1 * key.get()));
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/stopwordcount/StopWordCount.java b/src/main/java/olerom/stopwordcount/StopWordCount.java
new file mode 100644
index 0000000..03c8004
--- /dev/null
+++ b/src/main/java/olerom/stopwordcount/StopWordCount.java
@@ -0,0 +1,65 @@
+package olerom.stopwordcount;
+
+import olerom.wordcount.WordCountReducer;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Counters;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.HashSet;
+
+/**
+ * Date: 12.11.17
+ *
+ * @author olerom
+ */
+public class StopWordCount extends Configured implements Tool {
+    public int run(final String[] strings) throws Exception {
+        Configuration configuration = new Configuration();
+        configuration.set("stopWords", strings[2]);
+
+        final Job job = new Job(configuration, "Stop words");
+//        final Job job = new Job(getConf(), "Stop words");
+
+        job.setJarByClass(getClass());
+
+        TextInputFormat.addInputPath(job, new Path(strings[0]));
+        job.setInputFormatClass(TextInputFormat.class);
+
+        job.setMapperClass(StopWordCountMapper.class);
+
+        TextOutputFormat.setOutputPath(job, new Path(strings[1]));
+        job.setOutputFormatClass(TextOutputFormat.class);
+        job.setOutputKeyClass(Text.class);
+        job.setOutputValueClass(IntWritable.class);
+
+        int exit = job.waitForCompletion(true) ? 0 : 1;
+        Counters counter = job.getCounters();
+
+        System.out.println("Stop words: " + counter.findCounter(StopWordCountMapper.MATCH_COUNTER.STOP_WORD).getValue());
+        System.out.println("Total: " + counter.findCounter(StopWordCountMapper.MATCH_COUNTER.TOTAL).getValue());
+        System.out.println("Percentage: "
+                + ((double) counter.findCounter(StopWordCountMapper.MATCH_COUNTER.STOP_WORD).getValue()
+                / counter.findCounter(StopWordCountMapper.MATCH_COUNTER.TOTAL).getValue())
+        );
+
+        return exit;
+    }
+
+    public static void main(String[] args) throws Exception {
+        final int exitCode = ToolRunner.run(new StopWordCount(), args);
+        System.exit(exitCode);
+    }
+}
diff --git a/src/main/java/olerom/stopwordcount/StopWordCountMapper.java b/src/main/java/olerom/stopwordcount/StopWordCountMapper.java
new file mode 100644
index 0000000..45e7845
--- /dev/null
+++ b/src/main/java/olerom/stopwordcount/StopWordCountMapper.java
@@ -0,0 +1,60 @@
+package olerom.stopwordcount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.StringTokenizer;
+
+/**
+ * Date: 12.11.17
+ *
+ * @author olerom
+ */
+public class StopWordCountMapper
+        extends Mapper<LongWritable, Text, Text, IntWritable> {
+    private final HashSet<String> stopWords = new HashSet<>();
+
+    @Override
+    protected void map(LongWritable key, Text value, Context context)
+            throws IOException, InterruptedException {
+        StringTokenizer tokenizer = new StringTokenizer(value.toString());
+
+        shittySetup(context.getConfiguration().get("stopWords"));
+
+        while (tokenizer.hasMoreTokens()) {
+            String trim = tokenizer.nextToken().trim();
+            context.getCounter(MATCH_COUNTER.TOTAL).increment(1);
+            if (stopWords.contains(trim)) {
+                System.out.println("Contains: " + trim);
+                context.getCounter(MATCH_COUNTER.STOP_WORD).increment(1);
+            }
+        }
+    }
+
+    public enum MATCH_COUNTER {
+        STOP_WORD,
+        TOTAL
+    }
+
+
+    private void shittySetup(String path) {
+        BufferedReader br = null;
+        FileReader fr = null;
+        try {
+            fr = new FileReader(path);
+            br = new BufferedReader(fr);
+            String sCurrentLine;
+            while ((sCurrentLine = br.readLine()) != null) {
+                stopWords.add(sCurrentLine);
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/wordcount/WordCount.java b/src/main/java/olerom/wordcount/WordCount.java
new file mode 100644
index 0000000..24eb06e
--- /dev/null
+++ b/src/main/java/olerom/wordcount/WordCount.java
@@ -0,0 +1,37 @@
+package olerom.wordcount;
+
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+public class WordCount extends Configured implements Tool {
+    public int run(final String[] strings) throws Exception {
+        final Job job = new Job(getConf(), "olerom.wordcount.WordCount");
+        job.setJarByClass(getClass());
+
+        TextInputFormat.addInputPath(job, new Path(strings[0]));
+        job.setInputFormatClass(TextInputFormat.class);
+
+        job.setMapperClass(WordCountMapper.class);
+        job.setReducerClass(WordCountReducer.class);
+        job.setCombinerClass(WordCountReducer.class);
+
+        TextOutputFormat.setOutputPath(job, new Path(strings[1]));
+        job.setOutputFormatClass(TextOutputFormat.class);
+        job.setOutputKeyClass(Text.class);
+        job.setOutputValueClass(IntWritable.class);
+
+        return job.waitForCompletion(true) ? 0 : 1;
+    }
+
+    public static void main(String[] args) throws Exception {
+        final int exitCode = ToolRunner.run(new WordCount(), args);
+        System.exit(exitCode);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/wordcount/WordCountMapper.java b/src/main/java/olerom/wordcount/WordCountMapper.java
new file mode 100644
index 0000000..9932e35
--- /dev/null
+++ b/src/main/java/olerom/wordcount/WordCountMapper.java
@@ -0,0 +1,27 @@
+package olerom.wordcount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+
+import java.io.IOException;
+import java.util.StringTokenizer;
+
+public class WordCountMapper
+        extends Mapper<LongWritable, Text, Text, IntWritable> {
+
+    private final static IntWritable ONE = new IntWritable(1);
+    private final Text word = new Text();
+
+    @Override
+    protected void map(LongWritable key, Text value, Context context)
+            throws IOException, InterruptedException {
+        StringTokenizer tokenizer = new StringTokenizer(value.toString());
+
+        while (tokenizer.hasMoreTokens()) {
+            word.set(tokenizer.nextToken().trim());
+            context.write(word, ONE);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/olerom/wordcount/WordCountReducer.java b/src/main/java/olerom/wordcount/WordCountReducer.java
new file mode 100644
index 0000000..3779ae3
--- /dev/null
+++ b/src/main/java/olerom/wordcount/WordCountReducer.java
@@ -0,0 +1,26 @@
+package olerom.wordcount;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+public class WordCountReducer
+        extends Reducer<Text, IntWritable, Text, IntWritable> {
+
+    private IntWritable val = new IntWritable();
+
+    @Override
+    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
+            throws IOException, InterruptedException {
+
+        int totalSum = 0;
+        for (IntWritable value : values) {
+            totalSum += value.get();
+        }
+        val.set(totalSum);
+
+        context.write(key, val);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/pritykovskaya/WordCount.java b/src/main/java/pritykovskaya/WordCount.java
deleted file mode 100644
index b047511..0000000
--- a/src/main/java/pritykovskaya/WordCount.java
+++ /dev/null
@@ -1,80 +0,0 @@
-package pritykovskaya;
-
-
-import java.io.IOException;
-import java.util.StringTokenizer;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-
-
-public class WordCount extends Configured implements Tool {
-
-    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
-        private static final IntWritable ONE = new IntWritable(1);
-        private final transient Text word = new Text();
-
-        @Override public void map(final LongWritable key, final Text value, final Context context)
-            throws IOException, InterruptedException {
-            final String line = value.toString();
-            final StringTokenizer tokenizer = new StringTokenizer(line);
-            while (tokenizer.hasMoreTokens()) {
-                word.set(tokenizer.nextToken());
-                context.write(word, ONE);
-            }
-        }
-    }
-
-
-    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
-
-        @Override
-        public void reduce(final Text key, final Iterable<IntWritable> values, final Context context)
-            throws IOException, InterruptedException {
-            int sum = 0;
-            for (final IntWritable val : values) {
-                sum += val.get();
-            }
-            context.write(key, new IntWritable(sum));
-        }
-    }
-
-
-    @Override public int run(final String[] args) throws Exception {
-        final Configuration conf = this.getConf();
-        final Job job = Job.getInstance(conf, "Word Count");
-        job.setJarByClass(WordCount.class);
-
-        job.setMapperClass(MyMapper.class);
-        job.setReducerClass(MyReducer.class);
-
-        job.setOutputKeyClass(Text.class);
-        job.setOutputValueClass(IntWritable.class);
-
-        job.setInputFormatClass(TextInputFormat.class);
-        job.setOutputFormatClass(TextOutputFormat.class);
-
-        FileInputFormat.addInputPath(job, new Path(args[0]));
-        FileOutputFormat.setOutputPath(job, new Path(args[1]));
-
-        return job.waitForCompletion(true) ? 0 : 1;
-    }
-
-    public static void main(final String[] args) throws Exception {
-        final int returnCode = ToolRunner.run(new Configuration(), new WordCount(), args);
-        System.exit(returnCode);
-    }
-}
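
A note on src/main/java/olerom/stopwordcount/StopWordCountMapper.java above: shittySetup(...) is called from inside map(), so the stop-word file is re-read for every input record rather than once per task. The usual place for this kind of one-time initialization in the org.apache.hadoop.mapreduce API is the Mapper's setup() hook. The sketch below only illustrates that variant and is not part of the change set: the configuration key "stopWords" and the counting logic are taken from the diff, while the class name StopWordCountSetupMapper and the assumption that the stop-word file is a path readable from the local filesystem of every node (as the committed shittySetup() already assumes) are illustrative.

package olerom.stopwordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.StringTokenizer;

// Sketch only: same counting as StopWordCountMapper, but the stop-word list is
// loaded once per mapper task in setup() instead of once per map() call.
public class StopWordCountSetupMapper
        extends Mapper<LongWritable, Text, Text, IntWritable> {

    public enum MATCH_COUNTER {
        STOP_WORD,
        TOTAL
    }

    private final Set<String> stopWords = new HashSet<>();

    @Override
    protected void setup(Context context) throws IOException {
        // "stopWords" is the configuration key set in StopWordCount.run();
        // FileReader assumes the file is present on the local filesystem of every node.
        final String path = context.getConfiguration().get("stopWords");
        try (BufferedReader br = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = br.readLine()) != null) {
                stopWords.add(line.trim());
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        final StringTokenizer tokenizer = new StringTokenizer(value.toString());
        while (tokenizer.hasMoreTokens()) {
            final String token = tokenizer.nextToken().trim();
            context.getCounter(MATCH_COUNTER.TOTAL).increment(1);
            if (stopWords.contains(token)) {
                context.getCounter(MATCH_COUNTER.STOP_WORD).increment(1);
            }
        }
    }
}

The same map-side-reduction idea is what the combiner in olerom.wordcount.WordCount exploits: per measuring.txt, adding job.setCombinerClass(WordCountReducer.class) brought the WordCount run from over 2.5 hours (without a combiner) down to 1 hour 42 minutes, because partial sums are folded on the map side before the shuffle.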