Added facet benchmark that chooses random words as labels (mikemccand…

…#144)
iverase · Nov 17, 2021 · fec91aa · fec91aa
1 parent 0550148
commit fec91aa
Show file tree

Hide file tree

Showing 11 changed files with 203 additions and 26 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,9 @@
 *.ipr
 *.iws
 
+## Java flight recording
+*.jfr
+
 ## project files
 .project
 .classpath

diff --git a/src/main/perf/LineFileDocs.java b/src/main/perf/LineFileDocs.java
@@ -215,28 +215,35 @@ private void open() throws IOException {
       reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);
       String firstLine = reader.readLine();
       if (firstLine.startsWith("FIELDS_HEADER_INDICATOR")) {
-        if (!firstLine.startsWith("FIELDS_HEADER_INDICATOR###\tdoctitle\tdocdate\tbody") &&
-            !firstLine.startsWith("FIELDS_HEADER_INDICATOR###\ttitle\ttimestamp\ttext")) {
+        int defaultFieldLength = 4;
+        if (firstLine.startsWith("FIELDS_HEADER_INDICATOR###\tdoctitle\tdocdate\tbody") == false &&
+            firstLine.startsWith("FIELDS_HEADER_INDICATOR###\ttitle\ttimestamp\ttext") == false &&
+            firstLine.startsWith("FIELDS_HEADER_INDICATOR###\tdoctitle\tdocdate\tbody\tRandomLabel") == false) {
           throw new IllegalArgumentException("unrecognized header in line docs file: " + firstLine.trim());
         }
+        if (firstLine.startsWith("FIELDS_HEADER_INDICATOR###\tdoctitle\tdocdate\tbody\tRandomLabel")) {
+          defaultFieldLength = 5;
+        }
         if (facetFields.isEmpty() == false) {
           String[] fields = firstLine.split("\t");
-          if (fields.length > 4) {
-            extraFacetFields = Arrays.copyOfRange(fields, 4, fields.length);
+          if (fields.length > defaultFieldLength) {
+            extraFacetFields = Arrays.copyOfRange(fields, defaultFieldLength, fields.length);
             System.out.println("Additional facet fields: " + Arrays.toString(extraFacetFields));
 
             List<String> extraFacetFieldsList = Arrays.asList(extraFacetFields);
 
             // Verify facet fields now:
             for(String field : facetFields.keySet()) {
-              if (field.equals("Date") == false && field.equals("Month") == false && field.equals("DayOfYear") == false && !extraFacetFieldsList.contains(field)) {
+              if (field.equals("Date") == false && field.equals("Month") == false && field.equals("DayOfYear") == false
+                      && field.equals("RandomLabel") == false && extraFacetFieldsList.contains(field) == false) {
                 throw new IllegalArgumentException("facet field \"" + field + "\" is not recognized");
               }
             }
           } else {
             // Verify facet fields now:
             for(String field : facetFields.keySet()) {
-              if (field.equals("Date") == false && field.equals("Month") == false && field.equals("DayOfYear") == false) {
+              if (field.equals("Date") == false && field.equals("Month") == false && field.equals("DayOfYear") == false
+                      && field.equals("RandomLabel") == false) {
                 throw new IllegalArgumentException("facet field \"" + field + "\" is not recognized");
               }
             }
@@ -339,6 +346,8 @@ public static final class DocState {
     final Field id;
     final Field idPoint;
     final Field date;
+    final Field randomLabel;
+
     //final NumericDocValuesField dateMSec;
     //final LongField rand;
     final Field timeSec;
@@ -409,6 +418,9 @@ public static final class DocState {
       body = new Field("body", "", bodyFieldType);
       doc.add(body);
 
+      randomLabel = new Field("randomLabel", "", StringField.TYPE_NOT_STORED);
+      doc.add(randomLabel);  // add the label field itself; body was already added above
+
       id = new Field("id", "", StringField.TYPE_STORED);
       doc.add(id);
 
@@ -475,10 +487,11 @@ public Document nextDoc(DocState doc) throws IOException {
 
     long msecSinceEpoch;
     int timeSec;
-    int spot3;
+    int spot4;
     String line;
     String title;
     String body;
+    String randomLabel;
 
     if (isBinary) {
 
@@ -505,27 +518,37 @@ public Document nextDoc(DocState doc) throws IOException {
         nextDocs.set(lfd);
         //System.out.println("    got new buffer=" + buffer + " pos=" + buffer.position() + " limit=" + buffer.limit());
       }
+      // buffer format described in buildBinaryLineDocs.py
       ByteBuffer buffer = lfd.byteText;
       int titleLenBytes = buffer.getInt();
       int bodyLenBytes = buffer.getInt();
-      //System.out.println("    titleLen=" + titleLenBytes + " bodyLenBytes=" + bodyLenBytes);
-      msecSinceEpoch  = buffer.getLong();
+      int randomLabelLenBytes = buffer.getInt();
       timeSec  = buffer.getInt();
+      msecSinceEpoch  = buffer.getLong();
+//      System.out.println("    titleLen=" + titleLenBytes + " bodyLenBytes=" + bodyLenBytes +
+//              " randomLabelLenBytes=" + randomLabelLenBytes + " msecSinceEpoch=" + msecSinceEpoch + " timeSec=" + timeSec);
       byte[] bytes = buffer.array();
 
       char[] titleChars = new char[titleLenBytes];
       int titleLenChars = UnicodeUtil.UTF8toUTF16(bytes, buffer.position(), titleLenBytes, titleChars);
       title = new String(titleChars, 0, titleLenChars);
-      //System.out.println("title: " + title);
+//      System.out.println("title: " + title);
 
       char[] bodyChars = new char[bodyLenBytes];
       int bodyLenChars = UnicodeUtil.UTF8toUTF16(bytes, buffer.position()+titleLenBytes, bodyLenBytes, bodyChars);
       body = new String(bodyChars, 0, bodyLenChars);
-      buffer.position(buffer.position() + titleLenBytes + bodyLenBytes);
+//      System.out.println("body: " + body);
+
+      char[] randomLabelChars = new char[randomLabelLenBytes];
+      int randomLabelLenChars = UnicodeUtil.UTF8toUTF16(bytes, buffer.position()+titleLenBytes+bodyLenBytes, randomLabelLenBytes, randomLabelChars);
+      randomLabel = new String(randomLabelChars, 0, randomLabelLenChars);
+//      System.out.println("randomLabel: " + randomLabel);
+
+      buffer.position(buffer.position() + titleLenBytes + bodyLenBytes + randomLabelLenBytes);
 
       doc.dateCal.setTimeInMillis(msecSinceEpoch);
 
-      spot3 = 0;
+      spot4 = 0;
       line = null;
 
       if (lfd.vector != null) {
@@ -552,13 +575,21 @@ public Document nextDoc(DocState doc) throws IOException {
       if (spot2 == -1) {
         throw new RuntimeException("line: [" + line + "] is in an invalid format !");
       }
-      spot3 = line.indexOf(SEP, 1 + spot2);
+      int spot3 = line.indexOf(SEP, 1 + spot2);
       if (spot3 == -1) {
-        spot3 = line.length();
+        throw new RuntimeException("line: [" + line + "] is in an invalid format ! " +
+                "Your source file (enwiki-20120502-lines-1k.txt) might be out of date. " +
+                "Please download an updated version from home.apache.org/~mikemccand");
+      }
+      spot4 = line.indexOf(SEP, 1 + spot3);
+      if (spot4 == -1) {
+        spot4 = line.length();
       }
 
       body = line.substring(1+spot2, spot3);
 
+      randomLabel = line.substring(1+spot3, spot4).strip();
+
       title = line.substring(0, spot);
 
       final String dateString = line.substring(1+spot, spot2);
@@ -582,9 +613,10 @@ public Document nextDoc(DocState doc) throws IOException {
 
     final int myID = nextID.getAndIncrement();
 
-    bytesIndexed.addAndGet(body.length() + title.length());
+    bytesIndexed.addAndGet(body.length() + title.length() + randomLabel.length());
     doc.body.setStringValue(body);
     doc.title.setStringValue(title);
+    doc.randomLabel.setStringValue(randomLabel);
     if (addDVFields) {
       doc.titleBDV.setBytesValue(new BytesRef(title));
       doc.titleDV.setBytesValue(new BytesRef(title));
@@ -640,8 +672,18 @@ public Document nextDoc(DocState doc) throws IOException {
         }
       }
 
+      if (facetFields.containsKey("RandomLabel")) {
+        int flag = facetFields.get("RandomLabel");
+        if ((flag & 1) != 0) {
+          doc2.add(new FacetField("RandomLabel.taxonomy", randomLabel));
+        }
+        if ((flag & 2) != 0) {
+          doc2.add(new SortedSetDocValuesFacetField("RandomLabel.sortedset", randomLabel));
+        }
+      }
+
       if (extraFacetFields != null) {
-        String[] extraValues = line.substring(spot3+1, line.length()).split("\t");
+        String[] extraValues = line.substring(spot4+1, line.length()).split("\t");
 
         for(int i=0;i<extraFacetFields.length;i++) {
           String extraFieldName = extraFacetFields[i];

diff --git a/src/python/buildBinaryLineDocs.py b/src/python/buildBinaryLineDocs.py
@@ -25,29 +25,33 @@ def flush(pending, pendingDocCount, fOut):
       first = False
       if line.startswith('FIELDS_HEADER_INDICATOR'):
         print('skip header')
-        if len(line.strip().split('\t')) != 4:
-          raise RuntimeError('cannot convert line doc files that have more than title, timestamp, text fields: saw header %s' % line.rstrip())
+        if len(line.strip().split('\t')) != 5:
+          raise RuntimeError('cannot convert line doc files that have more than title, timestamp, text fields, random label: saw header %s' % line.rstrip())
         continue
       else:
         print('no header')
     tup = line.split('\t')
-    if len(tup) != 3:
+    if len(tup) != 4:
       raise RuntimeError('got %s' % str(tup))
-    title, date, body = tup
+    for s in tup:  # every column must contain something other than whitespace
+      if not s.strip():
+        raise RuntimeError('contained empty category: %s' % str(tup))
+    title, date, body, randomLabel = tup
 
     dt = datetime.datetime.strptime(date.replace('.000', ''), '%d-%b-%Y %H:%M:%S')
     msecSinceEpoch = int((dt - epoch).total_seconds() * 1000)
 
     timeSec = dt.hour*3600 + dt.minute * 60 + dt.second
     titleBytes = title.encode('utf-8')
     bodyBytes = body.encode('utf-8')
-    totalLength = len(titleBytes)+len(bodyBytes)+16
+    randomLabelBytes = randomLabel.strip().encode('utf-8')
+    totalLength = len(titleBytes)+len(bodyBytes)+len(randomLabelBytes)+20
     #print('len=%s' % totalLength)
     #print('HERE: %s, offset=%s' % (struct.pack('i', totalLength), fOut.tell()))
-
-    pending.write(struct.pack('iili', len(titleBytes), len(bodyBytes), msecSinceEpoch, timeSec))
+    pending.write(struct.pack('iiiil', len(titleBytes), len(bodyBytes), len(randomLabelBytes), timeSec, msecSinceEpoch))
     pending.write(titleBytes)
     pending.write(bodyBytes)
+    pending.write(randomLabelBytes)
     pendingDocCount += 1
 
     if pending.tell() > 64*1024:

diff --git a/src/python/constants.py b/src/python/constants.py
@@ -28,7 +28,8 @@
 #WIKI_MEDIUM_DOCS_LINE_FILE = '%s/data/enwiki-20100302-pages-articles-lines-1k-shuffled.txt' % BASE_DIR
 
 # wget http://home.apache.org/~mikemccand/enwiki-20120502-lines-1k.txt.lzma
-WIKI_MEDIUM_DOCS_LINE_FILE = '%s/data/enwiki-20120502-lines-1k.txt' % BASE_DIR
+WIKI_MEDIUM_DOC_BIN_LINE_FILE = '%s/data/enwiki-20120502-lines-1k-with-random-label.bin' % BASE_DIR
+WIKI_MEDIUM_DOCS_LINE_FILE = '%s/data/enwiki-20120502-lines-1k-with-random-label.txt' % BASE_DIR
 WIKI_MEDIUM_DOCS_COUNT = 33332620
 
 # Word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip (823MB download; 2.1GB unzipped)

diff --git a/src/python/createLineFileDocsWithRandomLabel.py b/src/python/createLineFileDocsWithRandomLabel.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import os
+import re
+
+from random import randrange
+
+USAGE= """
+Usage: python createLineFileDocsWithRandomLabel.py
+
+"""
+
+ORIGINAL_LINEFILE = 'enwiki-20120502-lines-1k.txt'
+TARGET_LINEFILE = 'enwiki-20120502-lines-1k-with-random-label.txt'
+
+def createLineFileDocsWithRandomLabels(original_file, target_file):
+    """Copy a line-docs file, appending a RandomLabel column to every row.
+
+    The first line is treated as the header and gets 'RandomLabel' appended.
+    Every following line is split on tabs; a label is drawn from its third
+    column (the body) with chooseRandomLabel and appended as a new last
+    column.  Rows whose body is missing, blank, or contains no word
+    characters get 'EMPTY_LABEL' as both body and label.
+
+    Args:
+        original_file: path of the line-docs file to read, or None to use
+            ORIGINAL_LINEFILE inside the 'data' directory that sits next to
+            the current working directory.
+        target_file: path of the file to write.  Recomputed from
+            TARGET_LINEFILE when original_file is None.
+
+    Prints a message and exits the interpreter when the data directory or
+    the given input path does not exist.
+    """
+    if original_file is None:
+        parent, _ = os.path.split(os.getcwd())
+        data_dir = os.path.join(parent, 'data')
+
+        if not os.path.exists(data_dir):
+            print('download data before running this script')
+            exit()
+
+        original_file = os.path.join(data_dir, ORIGINAL_LINEFILE)
+        target_file = os.path.join(data_dir, TARGET_LINEFILE)
+    elif not os.path.exists(original_file):
+        # os.path.exists is the real API; the misspelt os.path.exist raises
+        # AttributeError before the message below can be shown.
+        print("Recieved invalid path to data")
+        exit()
+
+    with open(original_file, 'r', encoding='ISO-8859-1') as original, open(target_file, 'w', encoding='ISO-8859-1') as out:
+        first_line = True
+        # No row is ever dropped; the counter only feeds the summary line.
+        skipped = 0
+        i = 0
+        for line in original:
+            line_arr = line.strip().split('\t')
+            if first_line:
+                # Header row: only name the new column.
+                first_line = False
+                line_arr.append('RandomLabel')
+                out.write('\t'.join(line_arr) + '\n')
+                continue
+
+            if i % 100000 == 0:
+                print("Converted ", i, " line file docs")
+
+            if len(line_arr) < 3:
+                # found a few lines that looked like this: ['Biosensor', '05-APR-2012 04:12:36.000'] with no body
+                line_arr.append('EMPTY_LABEL')
+                random_label = 'EMPTY_LABEL'
+            elif not isNotEmpty(line_arr[2]):
+                # Fill the blank body in place so the column count stays
+                # the same as for every other row.
+                line_arr[2] = 'EMPTY_LABEL'
+                random_label = 'EMPTY_LABEL'
+            else:
+                random_label = chooseRandomLabel(line_arr[2])
+                if random_label == "EMPTY_BODY":
+                    # Body had no word characters at all.
+                    random_label = "EMPTY_LABEL"
+                    line_arr[2] = "EMPTY_LABEL"
+
+            line_arr.append(random_label)
+            out.write('\t'.join(line_arr) + '\n')
+            i += 1
+        print("Indexed ", i, " total documents")
+        print("Skipped ", skipped, " documents")
+
+def chooseRandomLabel(body):
+    """Pick one random word from ``body`` to use as the document's label.
+
+    The text is split on non-word characters and empty fragments are
+    discarded.  Returns the sentinel "EMPTY_BODY" when no word remains,
+    otherwise one of the words, selected with ``randrange``.
+    """
+    # Fragments between \W separators consist only of word characters, so
+    # once the empty ones are dropped every candidate is a usable label and
+    # a single draw is enough.
+    body_arr = [word for word in re.split(r'\W', body) if word]
+    if not body_arr:
+        return "EMPTY_BODY"
+    return body_arr[randrange(len(body_arr))]
+
+def isNotEmpty(str):
+    """Return True when ``str`` is truthy and non-blank after ``strip()``."""
+    # The parameter shadows the ``str`` builtin; the name is kept for callers.
+    return bool(str and str.strip())
+
+if __name__ == '__main__':
+    # Command-line entry point: a help flag prints the usage text, two
+    # positional paths are passed through as source and target, and no
+    # arguments at all selects the default file locations.
+    argc = len(sys.argv)
+    wants_help = any(flag in sys.argv for flag in ('-help', '--help'))
+    if wants_help:
+        print(USAGE)
+    elif argc == 3:
+        createLineFileDocsWithRandomLabels(sys.argv[1], sys.argv[2])
+    elif argc == 1:
+        createLineFileDocsWithRandomLabels(None, None)
+    else:
+        print("Invalid arguments")
+        exit()
diff --git a/src/python/example.py b/src/python/example.py
@@ -29,7 +29,9 @@
                                   ('taxonomy:Month', 'Month'),
                                   ('taxonomy:DayOfYear', 'DayOfYear'),
                                   ('sortedset:Month', 'Month'),
-                                  ('sortedset:DayOfYear', 'DayOfYear')))
+                                  ('sortedset:DayOfYear', 'DayOfYear'),
+                                  ('taxonomy:RandomLabel', 'RandomLabel'),
+                                  ('sortedset:RandomLabel', 'RandomLabel')))
 
   #Warning -- Do not break the order of arguments
   #TODO -- Fix the following by using argparser

diff --git a/src/python/nightlyBench.py b/src/python/nightlyBench.py
@@ -1738,6 +1738,8 @@ def writeIndexHTML(searchChartData, days):
   writeOneLine(w, done, 'OrHighMedDayTaxoFacets', 'high-freq medium-freq +dayOfYear taxo facets')
   writeOneLine(w, done, 'AndHighHighDayTaxoFacets', '+high-freq +high-freq +dayOfYear taxo facets')
   writeOneLine(w, done, 'AndHighMedDayTaxoFacets', '+high-freq +medium-freq +dayOfYear taxo facets')
+  writeOneLine(w, done, 'BrowseRandomLabelTaxoFacets', 'Random labels chosen from each doc')
+  writeOneLine(w, done, 'BrowseRandomLabelSSDVFacets', 'Random labels chosen from each doc (doc values)')
 
   w('<br><br><b>Sorting (on TermQuery):</b>')
   writeOneLine(w, done, 'TermDTSort', 'Date/time (long, high cardinality)')
@@ -1811,7 +1813,9 @@ def writeIndexHTML(searchChartData, days):
   'BrowseDateTaxoFacets': 'All hierarchical taxonomy facet counts for last-modified year/month/day',
   'BrowseDayOfYearSSDVFacets': 'All flat sorted-set doc values facet counts for last-modified day-of-year',
   'BrowseMonthSSDVFacets': 'All flat sorted-set doc values facet counts for last-modified month',
-  }
+  'BrowseRandomLabelTaxoFacets': 'All flat taxonomy facet counts for a random word chosen from each doc',
+  'BrowseRandomLabelSSDVFacets' : 'All flat sorted-set doc values counts for a random word chosen from each doc',
+}
 
 def htmlEscape(s):
   return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

diff --git a/tasks/wikimedium.10M.nostopwords.tasks b/tasks/wikimedium.10M.nostopwords.tasks
@@ -14408,6 +14408,8 @@ BrowseMonthTaxoFacets: *:* +facets:Month.taxonomy
 BrowseDayOfYearTaxoFacets: *:* +facets:DayOfYear.taxonomy
 BrowseMonthSSDVFacets: *:* +facets:Month.sortedset
 BrowseDayOfYearSSDVFacets: *:* +facets:DayOfYear.sortedset
+BrowseRandomLabelTaxoFacets: *:* +facets:RandomLabel.taxonomy
+BrowseRandomLabelSSDVFacets: *:* +facets:RandomLabel.sortedset
 
 HighTermDayOfYearSort: dayofyeardvsort//ref # freq=3793973
 HighTermDayOfYearSort: dayofyeardvsort//http # freq=3493581

diff --git a/tasks/wikimedium.10M.tasks b/tasks/wikimedium.10M.tasks
@@ -11230,6 +11230,8 @@ BrowseMonthTaxoFacets: *:* +facets:Month.taxonomy
 BrowseDayOfYearTaxoFacets: *:* +facets:DayOfYear.taxonomy
 BrowseMonthSSDVFacets: *:* +facets:Month.sortedset
 BrowseDayOfYearSSDVFacets: *:* +facets:DayOfYear.sortedset
+BrowseRandomLabelSSDVFacets: *:* +facets:RandomLabel.sortedset
+BrowseRandomLabelTaxoFacets: *:* +facets:RandomLabel.taxonomy
 
 HighTermDayOfYearSort: dayofyeardvsort//ref # freq=3793973
 HighTermDayOfYearSort: dayofyeardvsort//http # freq=3493581