Merge branch 'release/0.10.0'
Joe Nievelt committed May 11, 2015
2 parents 091f735 + 32e26b5 commit 0a3c320
Showing 82 changed files with 3,164 additions and 718 deletions.
11 changes: 8 additions & 3 deletions .travis.yml
@@ -1,5 +1,10 @@
language: scala
sudo: false
scala:
- 2.10.4
- 2.11.4
matrix:
include:
- scala: 2.10.4
script: ./sbt ++$TRAVIS_SCALA_VERSION clean test

- scala: 2.11.5
script: ./sbt ++$TRAVIS_SCALA_VERSION clean test
after_success: "./sbt coveralls"
22 changes: 22 additions & 0 deletions CHANGES.md
@@ -1,5 +1,27 @@
# Algebird #

### Version 0.10.0 ###
* EventuallyAggregator and variants #407
* Add MultiAggregator.apply #408
* Return a MonoidAggregator from MultiAggregator when possible #409
* Add SummingWithHitsCache class to also track key hits. #410
* Add MapAggregator to compose tuples of (key, agg) pairs #411
* fix README.md. 2.9.3 no longer published #412
* Add Coveralls Badge to the README #413
* Add some combinators on MonoidAggregator #417
* Added function to safely downsize a HyperLogLog sketch #418
* AdaptiveCache #419
* fix property tests #421
* Make Preparer extend Serializable #422
* Make MutableBackedMap Serializable. #424
* A couple of performance optimizations: HyperLogLog and BloomFilter #426
* Adds a presenting benchmark and optimizes it #427
* Fixed broken links in README #428
* Speed up QTree #433
* Moments returns NaN when count is too low for higher order statistics #434
* Add HLL method to do error-based Aggregator #438
* Bump bijection to 0.8.0 #441

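Several of the 0.10.0 entries above (#408, #409, #417) concern composing aggregators. Below is a minimal sketch of that style of composition, assuming the tuple-based `MultiAggregator.apply` described in #408 together with the existing `Aggregator.fromMonoid` and `Aggregator.max` constructors; treat the exact signatures as assumptions rather than a record of the new API.

```scala
import com.twitter.algebird.{ Aggregator, MultiAggregator }

object AggregatorSketch extends App {
  val data = Seq(3, 1, 4, 1, 5, 9, 2, 6)

  // Two independent aggregations over the same input.
  val sum = Aggregator.fromMonoid[Int] // assumed constructor: sums via the Int monoid
  val max = Aggregator.max[Int]        // assumed constructor: keeps the largest element

  // Assumed usage of MultiAggregator.apply (#408): both run in a single pass,
  // producing a tuple of results.
  val sumAndMax = MultiAggregator((sum, max))

  println(sumAndMax(data)) // expected: (31, 9)
}
```
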
### Version 0.9.0 ###
* Replace mapValues with one single map to avoid serialization in frameworks like Spark. #344
* Add Fold trait for composable incremental processing (for develop) #350
17 changes: 9 additions & 8 deletions README.md
@@ -1,4 +1,5 @@
## Algebird [![Build Status](https://secure.travis-ci.org/twitter/algebird.png)](http://travis-ci.org/twitter/algebird)
## Algebird [![Build status](https://img.shields.io/travis/twitter/algebird/develop.svg)](http://travis-ci.org/twitter/algebird) [![Coverage status](https://img.shields.io/coveralls/twitter/algebird/develop.svg)](https://coveralls.io/r/twitter/algebird?branch=develop)


Abstract algebra for Scala. This code is targeted at building aggregation systems (via [Scalding](https://github.com/twitter/scalding) or [Storm](https://github.com/nathanmarz/storm)). It was originally developed as part of Scalding's Matrix API, where Matrices had values which are elements of Monoids, Groups, or Rings. Subsequently, it was clear that the code had broader application within Scalding and on other projects within Twitter.

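As a concrete illustration of the paragraph above (not part of this diff), Algebird ships Monoid instances for common types such as Map, so values can be combined generically:

```scala
import com.twitter.algebird.Monoid

// Maps are merged key-by-key using the value type's monoid (here: Int addition).
val clicks = Monoid.plus(Map("home" -> 1, "about" -> 2), Map("home" -> 3))
// clicks == Map("home" -> 4, "about" -> 2)

// Any collection of monoid values can be summed the same way.
val total = Monoid.sum(Seq(1, 2, 3, 4)) // 10
```
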
@@ -47,17 +48,17 @@ Discussion occurs primarily on the [Algebird mailing list](https://groups.google

## Maven

Algebird modules are available on maven central. The current groupid and version for all modules is, respectively, `"com.twitter"` and `0.9.0`.
Algebird modules are available on maven central. The current groupid and version for all modules is, respectively, `"com.twitter"` and `0.10.0`.

Current published artifacts are

* `algebird-core_2.9.3`
* `algebird-core_2.11`
* `algebird-core_2.10`
* `algebird-test_2.9.3`
* `algebird-test_2.11`
* `algebird-test_2.10`
* `algebird-util_2.9.3`
* `algebird-util_2.11`
* `algebird-util_2.10`
* `algebird-bijection_2.9.3`
* `algebird-bijection_2.11`
* `algebird-bijection_2.10`

The suffix denotes the scala version.
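For readers wiring this release into a build, the sbt equivalent of the Maven coordinates above would look like the following (illustrative only; the artifact list above is authoritative):

```scala
// build.sbt -- %% appends the Scala binary version (2.10 or 2.11) to the artifact name
libraryDependencies += "com.twitter" %% "algebird-core" % "0.10.0"
```
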
@@ -68,7 +69,7 @@ The suffix denotes the scala version.
We didn't know about it when we started this code, but it seems like we're more focused on
large scale analytics.

> Why not use Scalaz's [Monoid](https://github.com/scalaz/scalaz/blob/master/core/src/main/scala/scalaz/Monoid.scala) trait?
> Why not use Scalaz's [Monoid](http://docs.typelevel.org/api/scalaz/stable/7.0.4/doc/#scalaz.Monoid) trait?
The answer is a mix of the following:
* The trait itself is tiny, we just need zero and plus, it is the implementations for all the types that are important. We wrote a code generator to derive instances for all the tuples, and by hand wrote monoids for List, Set, Option, Map, and several other objects used for counting (DecayedValue for exponential decay, AveragedValue for averaging, HyperLogLog for approximate cardinality counting). It's the instances that are useful in scalding and elsewhere.
@@ -82,7 +83,7 @@ The answer is a mix of the following:
* Avi Bryant <http://twitter.com/avibryant>
* Edwin Chen <http://twitter.com/echen>
* ellchow <http://github.com/ellchow>
* Mike Gagnon <https://twitter.com/MichaelNGagnon>
* Mike Gagnon <https://twitter.com/gmike>
* Moses Nakamura <https://twitter.com/mnnakamura>
* Steven Noble <http://twitter.com/snoble>
* Sam Ritchie <http://twitter.com/sritchie>
@@ -16,11 +16,9 @@

package com.twitter.algebird.bijection

import org.scalatest.{ PropSpec, Matchers }
import org.scalatest.prop.PropertyChecks
import org.scalacheck.{ Arbitrary, Properties }
import com.twitter.algebird.CheckProperties

class AlgebirdBijectionLaws extends PropSpec with PropertyChecks with Matchers {
class AlgebirdBijectionLaws extends CheckProperties {
// TODO: Fill in tests. Ideally we'd publish an algebird-testing
// module before merging this in.
}
@@ -4,11 +4,11 @@ import java.lang.Math._
import java.util.concurrent.Executors
import java.util.concurrent.atomic.AtomicLong

import com.google.caliper.{Param, SimpleBenchmark}
import com.twitter.algebird.{HyperLogLogMonoid, _}
import com.google.caliper.{ Param, SimpleBenchmark }
import com.twitter.algebird.{ HyperLogLogMonoid, _ }
import com.twitter.algebird.util.summer._
import com.twitter.bijection._
import com.twitter.util.{Await, Duration, FuturePool}
import com.twitter.util.{ Await, Duration, FuturePool }

import scala.util.Random

@@ -1,7 +1,7 @@
package com.twitter.algebird.caliper

import com.google.caliper.{ Param, SimpleBenchmark }
import com.twitter.algebird.{ TopPctCMS, TopCMS, CMSHasherImplicits, TopPctCMSMonoid }
import com.twitter.algebird.{ TopPctCMS, CMSHasherImplicits, TopPctCMSMonoid }

/**
* Benchmarks the Count-Min sketch implementation in Algebird.
@@ -14,6 +14,9 @@ import com.twitter.algebird.{ TopPctCMS, TopCMS, CMSHasherImplicits, TopPctCMSMo
// - Annotate `timePlus` with `@MacroBenchmark`.
class CMSBenchmark extends SimpleBenchmark {

val Seed = 1
val JavaCharSizeInBits = 2 * 8

@Param(Array("0.1", "0.005"))
val eps: Double = 0.0

@@ -32,22 +35,24 @@ class CMSBenchmark extends SimpleBenchmark {
var random: scala.util.Random = _
var cmsLongMonoid: TopPctCMSMonoid[Long] = _
var cmsBigIntMonoid: TopPctCMSMonoid[BigInt] = _
var cmsStringMonoid: TopPctCMSMonoid[String] = _
var inputsBigInt: Seq[BigInt] = _
var inputsString: Seq[String] = _

override def setUp {
override def setUp() {
// Required import of implicit values (e.g. for BigInt- or Long-backed CMS instances)
import CMSHasherImplicits._

cmsLongMonoid = {
val seed = 1
TopPctCMS.monoid[Long](eps, delta, seed, heavyHittersPct)
}

cmsBigIntMonoid = {
val seed = 1
TopPctCMS.monoid[BigInt](eps, delta, seed, heavyHittersPct)
}
cmsLongMonoid = TopPctCMS.monoid[Long](eps, delta, Seed, heavyHittersPct)
cmsBigIntMonoid = TopPctCMS.monoid[BigInt](eps, delta, Seed, heavyHittersPct)
cmsStringMonoid = TopPctCMS.monoid[String](eps, delta, Seed, heavyHittersPct)

random = new scala.util.Random

inputsString = (0 to operations).map { i => random.nextString(maxBits / JavaCharSizeInBits) }.toSeq
Console.out.println(s"Created ${inputsString.size} input records for String")
inputsBigInt = inputsString.map { s => BigInt(s.getBytes) }
Console.out.println(s"Created ${inputsBigInt.size} input records for BigInt")
}

// Case A (K=Long): We count the first hundred integers, i.e. [1, 100]
@@ -70,14 +75,21 @@ class CMSBenchmark extends SimpleBenchmark {
dummy
}

// Case B.2 (K=BigInt): We draw numbers randomly from a 2^maxBits address space
// Case B.2 (K=BigInt): We count numbers drawn randomly from a 2^maxBits address space
def timePlusOfRandom2048BitNumbersWithBigIntCms(reps: Int): Int = {
var dummy = 0
while (dummy < reps) {
(1 to operations).view.foldLeft(cmsBigIntMonoid.zero)((l, r) => {
val n = scala.math.BigInt(maxBits, random)
l ++ cmsBigIntMonoid.create(n)
})
inputsBigInt.view.foldLeft(cmsBigIntMonoid.zero)((l, r) => l ++ cmsBigIntMonoid.create(r))
dummy += 1
}
dummy
}

// Case C (K=String): We count strings drawn randomly from a 2^maxBits address space
def timePlusOfRandom2048BitNumbersWithStringCms(reps: Int): Int = {
var dummy = 0
while (dummy < reps) {
inputsString.view.foldLeft(cmsStringMonoid.zero)((l, r) => l ++ cmsStringMonoid.create(r))
dummy += 1
}
dummy
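Outside the benchmark harness, the TopPctCMS monoid configured in setUp() above is used along these lines. This is a hedged sketch built from the calls visible in this diff (`TopPctCMS.monoid`, `create`, `++`); the `frequency` and `heavyHitters` accessors are assumed from the CMS API of this period:

```scala
import com.twitter.algebird.TopPctCMS
import com.twitter.algebird.CMSHasherImplicits._ // hashers for Long/BigInt-backed CMS

// Positional parameters as in setUp() above: eps, delta, seed, heavyHittersPct.
val cmsMonoid = TopPctCMS.monoid[Long](0.005, 1E-8, 1, 0.05)

val data = Seq(1L, 1L, 1L, 2L, 3L, 1L)
val cms = data.map(cmsMonoid.create).reduce(cmsMonoid.plus)

println(cms.frequency(1L).estimate) // approximate count of 1L (assumed accessor)
println(cms.heavyHitters)           // items above heavyHittersPct of the total count (assumed accessor)
```
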
@@ -1,6 +1,6 @@
package com.twitter.algebird.caliper

import com.google.caliper.{Param, SimpleBenchmark}
import com.google.caliper.{ Param, SimpleBenchmark }

/**
* Benchmarks the hashing algorithms used by Count-Min sketch for CMS[BigInt].
@@ -33,7 +33,7 @@ class CMSHashingBenchmark extends SimpleBenchmark {
/**
* Width of the counting table.
*/
@Param(Array("11" /* eps = 0.271 */ , "544" /* eps = 0.005 */ , "2719" /* eps = 1E-3 */ , "271829" /* eps = 1E-5 */))
@Param(Array("11" /* eps = 0.271 */ , "544" /* eps = 0.005 */ , "2719" /* eps = 1E-3 */ , "271829" /* eps = 1E-5 */ ))
val width: Int = 0

/**
@@ -54,7 +54,7 @@ class CMSHashingBenchmark extends SimpleBenchmark {
override def setUp() {
random = new scala.util.Random
// We draw numbers randomly from a 2^maxBits address space.
inputs = (1 to operations).view.map { _ => scala.math.BigInt(maxBits, random)}
inputs = (1 to operations).view.map { _ => scala.math.BigInt(maxBits, random) }
}

private def murmurHashScala(a: Int, b: Int, width: Int)(x: BigInt) = {
@@ -82,7 +82,7 @@ class CMSHashingBenchmark extends SimpleBenchmark {
def timeBrokenCurrentHashWithRandomMaxBitsNumbers(operations: Int): Int = {
var dummy = 0
while (dummy < operations) {
inputs.foreach { input => brokenCurrentHash(a, b, width)(input)}
inputs.foreach { input => brokenCurrentHash(a, b, width)(input) }
dummy += 1
}
dummy
@@ -91,7 +91,7 @@ class CMSHashingBenchmark extends SimpleBenchmark {
def timeMurmurHashScalaWithRandomMaxBitsNumbers(operations: Int): Int = {
var dummy = 0
while (dummy < operations) {
inputs.foreach { input => murmurHashScala(a, b, width)(input)}
inputs.foreach { input => murmurHashScala(a, b, width)(input) }
dummy += 1
}
dummy
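The width values paired with eps figures in the @Param comments above follow the standard Count-Min sizing rule w = ceil(e / eps); a quick sanity check of that relation (not part of the benchmark):

```scala
// Standard CMS sizing: a smaller eps (tighter error bound) needs a wider counting table.
def widthFor(eps: Double): Int = math.ceil(math.E / eps).toInt

widthFor(0.271) // 11
widthFor(0.005) // 544
widthFor(1E-3)  // 2719
widthFor(1E-5)  // 271829
```
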
@@ -7,7 +7,6 @@ import com.twitter.bijection._
import java.util.concurrent.Executors
import com.twitter.algebird.util._
import com.google.caliper.{ Param, SimpleBenchmark }
import com.twitter.algebird.HyperLogLogMonoid
import java.nio.ByteBuffer

import scala.math._
@@ -19,7 +18,7 @@ class OldMonoid(bits: Int) extends HyperLogLogMonoid(bits) {
else {
val buffer = new Array[Byte](size)
items.foreach { _.updateInto(buffer) }
Some(DenseHLL(bits, buffer.toIndexedSeq))
Some(DenseHLL(bits, Bytes(buffer)))
}
}

@@ -0,0 +1,42 @@
package com.twitter.algebird.caliper

import com.google.caliper.{ SimpleBenchmark, Param }
import com.twitter.algebird.{ HyperLogLogMonoid, HLL }
import com.twitter.bijection._
import java.nio.ByteBuffer

class HLLPresentBenchmark extends SimpleBenchmark {
@Param(Array("5", "10", "17", "20"))
val bits: Int = 0

@Param(Array("10", "100", "500", "1000", "10000"))
val max: Int = 0

@Param(Array("10", "20", "100"))
val numHLL: Int = 0

var data: IndexedSeq[HLL] = _

implicit val byteEncoder = implicitly[Injection[Long, Array[Byte]]]

override def setUp {
val hllMonoid = new HyperLogLogMonoid(bits)
val r = new scala.util.Random(12345L)
data = (0 until numHLL).map { _ =>
val input = (0 until max).map(_ => r.nextLong).toSet
hllMonoid.batchCreate(input)(byteEncoder.toFunction)
}.toIndexedSeq

}

def timeBatchCreate(reps: Int): Int = {
var dummy = 0
while (dummy < reps) {
data.foreach { hll =>
hll.approximateSize
}
dummy += 1
}
dummy
}
}
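For context, the monoid this benchmark exercises estimates set cardinality. Below is a minimal usage sketch mirroring the calls above (`batchCreate`, `approximateSize`), with the Long-to-bytes Injection handled the same way the benchmark handles it:

```scala
import com.twitter.algebird.HyperLogLogMonoid
import com.twitter.bijection._

val hllMonoid = new HyperLogLogMonoid(12) // 12 bits of addressing: roughly 1.6% standard error

val toBytes = implicitly[Injection[Long, Array[Byte]]]
val userIds: Set[Long] = (1L to 10000L).toSet

val hll = hllMonoid.batchCreate(userIds)(toBytes.toFunction)
println(hll.approximateSize.estimate) // close to userIds.size
```
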
@@ -0,0 +1,61 @@
package com.twitter.algebird.caliper

import com.twitter.algebird._
import scala.util.Random
import com.twitter.bijection._

import java.util.concurrent.Executors
import com.twitter.algebird.util._
import com.google.caliper.{ Param, SimpleBenchmark }
import java.nio.ByteBuffer

import scala.math._

class OldQTreeSemigroup[A: Monoid](k: Int) extends QTreeSemigroup[A](k) {
override def sumOption(items: TraversableOnce[QTree[A]]) =
if (items.isEmpty) None
else Some(items.reduce(plus))
}

class QTreeBenchmark extends SimpleBenchmark {
var qtree: QTreeSemigroup[Long] = _
var oldqtree: QTreeSemigroup[Long] = _

@Param(Array("5", "10", "12"))
val depthK: Int = 0

@Param(Array("100", "10000"))
val numElements: Int = 0

var inputData: Seq[QTree[Long]] = _

override def setUp {
qtree = new QTreeSemigroup[Long](depthK)
oldqtree = new OldQTreeSemigroup(depthK)

val rng = new Random("qtree".hashCode)

inputData = (0L until numElements).map { _ =>
QTree(rng.nextInt(1000).toLong)
}
}

def timeSumOption(reps: Int): Int = {
var dummy = 0
while (dummy < reps) {
qtree.sumOption(inputData)
dummy += 1
}
dummy
}

/*
def timeOldSumOption(reps: Int): Int = {
var dummy = 0
while (dummy < reps) {
oldqtree.sumOption(inputData)
dummy += 1
}
dummy
} */
}
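
As a companion to the benchmark, here is a minimal sketch of using a QTreeSemigroup for approximate quantiles, mirroring the QTree(...) and sumOption calls above; quantileBounds is assumed from the QTree API of this period:

```scala
import com.twitter.algebird.{ QTree, QTreeSemigroup }

val semigroup = new QTreeSemigroup[Long](6) // level parameter k: higher means more precision, larger trees

val samples = (1 to 10000).map(_ => scala.util.Random.nextInt(1000).toLong)
val summary = semigroup.sumOption(samples.map(QTree(_))).get

val (lowerBound, upperBound) = summary.quantileBounds(0.5) // bounds on the median (assumed accessor)
println(s"median is in [$lowerBound, $upperBound]")
```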