Ehm, this should improve stuff.

bcleenders · Jan 12, 2013 · b7957b0 · b7957b0
1 parent 013bbaa
commit b7957b0
Show file tree

Hide file tree

Showing 15 changed files with 57 additions and 286 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 build
 dist
 *~
+_*
diff --git a/README.md b/README.md
@@ -1,70 +1,6 @@
 naward09
 ========
 
-A collection of scripts. Some really useless, others even more.
+A collection of scripts used for estimating the Shannon limit of language as used on the internet.
 
-Or so it may seem... It might just turn out to work and even be correct!
-
-##TODO's
-* Run on big data set
-* Generate some fancy graphs?
-
-
----
-##The plan...
-
-###Run 1 - tuple gathering
-Mapper; send N character sets
-
-    var inputStream,
-		i = 0,
-		buffer = char[3],
-		ONE = new LongWritable(1L);
-
-	while(inputStream.hasNext()) {
-		buffer[i] = inputStream.next();
-		i = (i + 1) % 3;
-		buffer[i] = inputStream.next();
-		emit( [ buffer[i], buffer[(i+1) % 3], buffer[(i+2) % 3] ], ONE);
-	}
-
-Combiner/Reducer; sum the sets to a \[tuple, occurrences\]
-(single set)
-
-###Run 2 - calc total
-A second run, over the output of the first run. This transforms a large dictionary into three values, two of which are required to calculate the entropy.
-
-About the output;
-LOG X	The sum of o*ln(o), where o is the number of occurrences of a certain combination.
-TOTAL Y	The sum of all the o
-DIFF Z	The number of distinct combinations (#o)
-
-###Run 3 - calculating the entropy
-Continues where run 2 stopped.
-
-ln(TOTAL) - (LOG/TOTAL)
-
-LOG and TOTAL being the respective output of shanCalc
-
----
-## NGramCount
-
-The last N input bytes are encoded in an int (or long). After reading position x of the input, the 32-bits int is composed like this:
-
-    31-30 | 29-25 | 24-20 | 19-15 | 14-10 | 9-5 | 4-0 
-     00      x-5     x-4     x-3     x-2    x-1    x
-
-Unused bits are zero, e.g. N=4:
-
-    31-30 | 29-25 | 24-20 | 19-15 | 14-10 | 9-5 | 4-0 
-     00     00000   00000    x-3     x-2    x-1    x
-
-A space, uppercase and lowercase letters are encoded with their 5 least significant bits in ASCII: SPACE=0, a=A=1, z=Z=26.
-
-    Char  | Hex  | Binary
-    ------|------|----------
-    Space | 0x20 | 001 00000
-        A | 0x41 | 010 00001
-        a | 0x61 | 011 00001
-        Z | 0x5a | 010 11010
-        z | 0x7a | 011 11010
+Please visit the [wiki](https://github.com/norvigaward/naward09/wiki) for the documentation!
diff --git a/src/charCount/CharCount.java b/src/charCount/CharCount.java
@@ -40,7 +40,7 @@ public int run(String[] args) throws Exception {
 		FileInputFormat.setInputPathFilter(job, SimpleInputFilter.class);
 		SimpleInputFilter.setFilter("textData");
 		Path inputPath = new Path("/data/public/common-crawl/parse-output/segment/*/*");
-		Path outputPath = new Path("/user/naward09/FULL_RUN_DO_NOT_REMOVE_ME/");
+		Path outputPath = new Path("/user/naward09/CharCount_output/");
 		FileInputFormat.addInputPath(job, inputPath);
 		FileOutputFormat.setOutputPath(job, outputPath);
 

diff --git a/src/charCount/CharMapper.java b/src/charCount/CharMapper.java
@@ -4,12 +4,27 @@
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 
+/*
+* Transforms a Text input to zero or more strings of length 8 as KEY, with value 1L.
+* Every possible substring of this text of length 8 (illegal characters being ignored) is emitted as many times as it occurs.
+*/
 public class CharMapper<KEY> extends Mapper<KEY, Text, Text, LongWritable> {
+	// ~ Size of the output Strings
 	private static final int SIZE = 8;
+
+	// ~ cycle[n] is only valid for n <= MAX_INDEX
 	private static final int MAX_INDEX = SIZE - 1;
+
+	// ~ The output value
 	private static final LongWritable ONE = new LongWritable(1L);
+
+	// ~ Used to transform the char arrays to Strings, and then to Text's
 	private StringBuilder builder = new StringBuilder(SIZE);
+
+	// ~ Initialized only once, then set() is called every time it is written to the Context
 	private Text txt = new Text();
+
+	// ~ The char array used to store the combinations. It acts like a cycled buffer.
 	private char[] cycle = new char[SIZE];
 
 	public void map(KEY key, Text val, Context context) {
@@ -22,8 +37,9 @@ public void map(KEY key, Text val, Context context) {
 			// ~ Current position in input line
 			int j;
 
-			// ~ Fill the buffer
+			// ~ The number of illegal characters found so far.
 			int useless = 0;
+			// ~ Fill the buffer, before starting to submit anything (it's only half-full)
 			for (j = 0; (j < MAX_INDEX + useless) && (j < line.length()); j++) {
 				char currChar = Character.toLowerCase(line.charAt(j));
 
@@ -36,14 +52,14 @@ public void map(KEY key, Text val, Context context) {
 				}
 			}
 
-			// ~ Start looping through the rest of the string
+			// ~ Start looping through the rest of the string, start at position j
 			for (; j < line.length(); j++) {
 				char currChar = Character.toLowerCase(line.charAt(j));
 				if (isValid(currChar)) {
 					i = i % SIZE;
 					this.cycle[i] = currChar;
 
-					// Build the output from the cycle
+					// ~ Build the output from the cycle
 					builder.delete(0, SIZE);
 					if (i < MAX_INDEX) {
 						builder.append(cycle, i + 1, MAX_INDEX - i);
@@ -53,6 +69,7 @@ public void map(KEY key, Text val, Context context) {
 					i++;
 
 					try {
+						// ~ Done; write it to the Context, and start over again :)
 						txt.set(builder.toString());
 						context.write(txt, ONE);
 					} catch (Exception e) {
@@ -62,7 +79,15 @@ public void map(KEY key, Text val, Context context) {
 		}
 	}
 
+	/*
+	* Returns true if this is a valid char, i.e. a lowercase letter a-z or a space.
+	*/
 	private boolean isValid(char c) {
+		/*
+		* Oracle specifically advises against this, but it works and it's fast!
+		* We'll just take the incompatibility with Japanese for granted... Just remember to change it if you 
+		* want to work with really, really weird characters!
+		*/
 		return ('a' <= c && c <= 'z') || c == ' ';
 	}
 }
diff --git a/src/charCount/README.md b/src/charCount/README.md
@@ -5,4 +5,6 @@ This program counts the occurrences of sets of characters, much like a word coun
 
 Once it's done, it sums everything up and dumps the [char set][#occurrences] combinations in a file.
 
-The length of the sets of characters is variable, and only the alphabet and spaces (/[a-zA-Z ]{N}/) are accounted for!
+The length of the sets of characters is variable (it can be changed in the CharMapper), and only the alphabet and spaces (/[a-zA-Z ]{N}/) are accounted for!
+
+*Beware* of the output size: it gets very large for large values of N (such as N=8).
diff --git a/src/charCount/result b/src/charCount/result
diff --git a/src/shannon/DoubleSumReducer.java b/src/shannon/DoubleSumReducer.java
diff --git a/src/shannon/README.md b/src/shannon/README.md
diff --git a/src/shannon/Run.java b/src/shannon/Run.java
diff --git a/src/shannon/ShanCalc.java b/src/shannon/ShanCalc.java
diff --git a/src/shannon/ShanMap.java b/src/shannon/ShanMap.java
diff --git a/src/shannon/SimpleInputFilter.java b/src/shannon/SimpleInputFilter.java
-Original file line number
+Diff line change
@@ -1,3 +1,4 @@
     build
     dist
     *~
+    _*