Skip to content

Commit

Permalink
Added facet benchmark that chooses random words as labels (mikemccand…
Browse files Browse the repository at this point in the history
  • Loading branch information
mdmarshmallow authored Nov 17, 2021
1 parent 0550148 commit fec91aa
Show file tree
Hide file tree
Showing 11 changed files with 203 additions and 26 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
*.ipr
*.iws

## Java flight recording
*.jfr

## project files
.project
.classpath
Expand Down
74 changes: 58 additions & 16 deletions src/main/perf/LineFileDocs.java
Original file line number Diff line number Diff line change
Expand Up @@ -215,28 +215,35 @@ private void open() throws IOException {
reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);
String firstLine = reader.readLine();
if (firstLine.startsWith("FIELDS_HEADER_INDICATOR")) {
if (!firstLine.startsWith("FIELDS_HEADER_INDICATOR###\tdoctitle\tdocdate\tbody") &&
!firstLine.startsWith("FIELDS_HEADER_INDICATOR###\ttitle\ttimestamp\ttext")) {
int defaultFieldLength = 4;
if (firstLine.startsWith("FIELDS_HEADER_INDICATOR###\tdoctitle\tdocdate\tbody") == false &&
firstLine.startsWith("FIELDS_HEADER_INDICATOR###\ttitle\ttimestamp\ttext") == false &&
firstLine.startsWith("FIELD_HEADER_INDICATOR###\tdoctitle\tdocdate\tbody\tRandomLabel") == false) {
throw new IllegalArgumentException("unrecognized header in line docs file: " + firstLine.trim());
}
if (firstLine.startsWith("FIELDS_HEADER_INDICATOR###\tdoctitle\tdocdate\tbody\tRandomLabel")) {
defaultFieldLength = 5;
}
if (facetFields.isEmpty() == false) {
String[] fields = firstLine.split("\t");
if (fields.length > 4) {
extraFacetFields = Arrays.copyOfRange(fields, 4, fields.length);
if (fields.length > defaultFieldLength) {
extraFacetFields = Arrays.copyOfRange(fields, defaultFieldLength, fields.length);
System.out.println("Additional facet fields: " + Arrays.toString(extraFacetFields));

List<String> extraFacetFieldsList = Arrays.asList(extraFacetFields);

// Verify facet fields now:
for(String field : facetFields.keySet()) {
if (field.equals("Date") == false && field.equals("Month") == false && field.equals("DayOfYear") == false && !extraFacetFieldsList.contains(field)) {
if (field.equals("Date") == false && field.equals("Month") == false && field.equals("DayOfYear") == false
&& field.equals("RandomLabel") == false && extraFacetFieldsList.contains(field) == false) {
throw new IllegalArgumentException("facet field \"" + field + "\" is not recognized");
}
}
} else {
// Verify facet fields now:
for(String field : facetFields.keySet()) {
if (field.equals("Date") == false && field.equals("Month") == false && field.equals("DayOfYear") == false) {
if (field.equals("Date") == false && field.equals("Month") == false && field.equals("DayOfYear") == false
&& field.equals("RandomLabel") == false) {
throw new IllegalArgumentException("facet field \"" + field + "\" is not recognized");
}
}
Expand Down Expand Up @@ -339,6 +346,8 @@ public static final class DocState {
final Field id;
final Field idPoint;
final Field date;
final Field randomLabel;

//final NumericDocValuesField dateMSec;
//final LongField rand;
final Field timeSec;
Expand Down Expand Up @@ -409,6 +418,9 @@ public static final class DocState {
body = new Field("body", "", bodyFieldType);
doc.add(body);

randomLabel = new Field("randomLabel", "", StringField.TYPE_NOT_STORED);
doc.add(body);

id = new Field("id", "", StringField.TYPE_STORED);
doc.add(id);

Expand Down Expand Up @@ -475,10 +487,11 @@ public Document nextDoc(DocState doc) throws IOException {

long msecSinceEpoch;
int timeSec;
int spot3;
int spot4;
String line;
String title;
String body;
String randomLabel;

if (isBinary) {

Expand All @@ -505,27 +518,37 @@ public Document nextDoc(DocState doc) throws IOException {
nextDocs.set(lfd);
//System.out.println(" got new buffer=" + buffer + " pos=" + buffer.position() + " limit=" + buffer.limit());
}
// buffer format described in buildBinaryLineDocs.py
ByteBuffer buffer = lfd.byteText;
int titleLenBytes = buffer.getInt();
int bodyLenBytes = buffer.getInt();
//System.out.println(" titleLen=" + titleLenBytes + " bodyLenBytes=" + bodyLenBytes);
msecSinceEpoch = buffer.getLong();
int randomLabelLenBytes = buffer.getInt();
timeSec = buffer.getInt();
msecSinceEpoch = buffer.getLong();
// System.out.println(" titleLen=" + titleLenBytes + " bodyLenBytes=" + bodyLenBytes +
// " randomLabelLenBytes=" + randomLabelLenBytes + " msecSinceEpoch=" + msecSinceEpoch + " timeSec=" + timeSec);
byte[] bytes = buffer.array();

char[] titleChars = new char[titleLenBytes];
int titleLenChars = UnicodeUtil.UTF8toUTF16(bytes, buffer.position(), titleLenBytes, titleChars);
title = new String(titleChars, 0, titleLenChars);
//System.out.println("title: " + title);
// System.out.println("title: " + title);

char[] bodyChars = new char[bodyLenBytes];
int bodyLenChars = UnicodeUtil.UTF8toUTF16(bytes, buffer.position()+titleLenBytes, bodyLenBytes, bodyChars);
body = new String(bodyChars, 0, bodyLenChars);
buffer.position(buffer.position() + titleLenBytes + bodyLenBytes);
// System.out.println("body: " + body);

char[] randomLabelChars = new char[randomLabelLenBytes];
int randomLabelLenChars = UnicodeUtil.UTF8toUTF16(bytes, buffer.position()+titleLenBytes+bodyLenBytes, randomLabelLenBytes, randomLabelChars);
randomLabel = new String(randomLabelChars, 0, randomLabelLenChars);
// System.out.println("randomLabel: " + randomLabel);

buffer.position(buffer.position() + titleLenBytes + bodyLenBytes + randomLabelLenBytes);

doc.dateCal.setTimeInMillis(msecSinceEpoch);

spot3 = 0;
spot4 = 0;
line = null;

if (lfd.vector != null) {
Expand All @@ -552,13 +575,21 @@ public Document nextDoc(DocState doc) throws IOException {
if (spot2 == -1) {
throw new RuntimeException("line: [" + line + "] is in an invalid format !");
}
spot3 = line.indexOf(SEP, 1 + spot2);
int spot3 = line.indexOf(SEP, 1 + spot2);
if (spot3 == -1) {
spot3 = line.length();
throw new RuntimeException("line: [" + line + "] is in an invalid format !" +
"Your source file (enwiki-20120502-lines-1k.txt) might be out of date." +
"Please download an updated version from home.apache.org/~mikemccand");
}
spot4 = line.indexOf(SEP, 1 + spot3);
if (spot4 == -1) {
spot4 = line.length();
}

body = line.substring(1+spot2, spot3);

randomLabel = line.substring(1+spot3, spot4).strip();

title = line.substring(0, spot);

final String dateString = line.substring(1+spot, spot2);
Expand All @@ -582,9 +613,10 @@ public Document nextDoc(DocState doc) throws IOException {

final int myID = nextID.getAndIncrement();

bytesIndexed.addAndGet(body.length() + title.length());
bytesIndexed.addAndGet(body.length() + title.length() + randomLabel.length());
doc.body.setStringValue(body);
doc.title.setStringValue(title);
doc.randomLabel.setStringValue(randomLabel);
if (addDVFields) {
doc.titleBDV.setBytesValue(new BytesRef(title));
doc.titleDV.setBytesValue(new BytesRef(title));
Expand Down Expand Up @@ -640,8 +672,18 @@ public Document nextDoc(DocState doc) throws IOException {
}
}

if (facetFields.containsKey("RandomLabel")) {
int flag = facetFields.get("RandomLabel");
if ((flag & 1) != 0) {
doc2.add(new FacetField("RandomLabel.taxonomy", randomLabel));
}
if ((flag & 2) != 0) {
doc2.add(new SortedSetDocValuesFacetField("RandomLabel.sortedset", randomLabel));
}
}

if (extraFacetFields != null) {
String[] extraValues = line.substring(spot3+1, line.length()).split("\t");
String[] extraValues = line.substring(spot4+1, line.length()).split("\t");

for(int i=0;i<extraFacetFields.length;i++) {
String extraFieldName = extraFacetFields[i];
Expand Down
18 changes: 11 additions & 7 deletions src/python/buildBinaryLineDocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,33 @@ def flush(pending, pendingDocCount, fOut):
first = False
if line.startswith('FIELDS_HEADER_INDICATOR'):
print('skip header')
if len(line.strip().split('\t')) != 4:
raise RuntimeError('cannot convert line doc files that have more than title, timestamp, text fields: saw header %s' % line.rstrip())
if len(line.strip().split('\t')) != 5:
raise RuntimeError('cannot convert line doc files that have more than title, timestamp, text fields, random label: saw header %s' % line.rstrip())
continue
else:
print('no header')
tup = line.split('\t')
if len(tup) != 3:
if len(tup) != 4:
raise RuntimeError('got %s' % str(tup))
title, date, body = tup
for s in tup:
if not s.strip():
raise RuntimeError('contained empty category' % str(tup))
title, date, body, randomLabel = tup

dt = datetime.datetime.strptime(date.replace('.000', ''), '%d-%b-%Y %H:%M:%S')
msecSinceEpoch = int((dt - epoch).total_seconds() * 1000)

timeSec = dt.hour*3600 + dt.minute * 60 + dt.second
titleBytes = title.encode('utf-8')
bodyBytes = body.encode('utf-8')
totalLength = len(titleBytes)+len(bodyBytes)+16
randomLabelBytes = randomLabel.strip().encode('utf-8')
totalLength = len(titleBytes)+len(bodyBytes)+len(randomLabelBytes)+20
#print('len=%s' % totalLength)
#print('HERE: %s, offset=%s' % (struct.pack('i', totalLength), fOut.tell()))

pending.write(struct.pack('iili', len(titleBytes), len(bodyBytes), msecSinceEpoch, timeSec))
pending.write(struct.pack('iiiil', len(titleBytes), len(bodyBytes), len(randomLabelBytes), timeSec, msecSinceEpoch))
pending.write(titleBytes)
pending.write(bodyBytes)
pending.write(randomLabelBytes)
pendingDocCount += 1

if pending.tell() > 64*1024:
Expand Down
3 changes: 2 additions & 1 deletion src/python/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
#WIKI_MEDIUM_DOCS_LINE_FILE = '%s/data/enwiki-20100302-pages-articles-lines-1k-shuffled.txt' % BASE_DIR

# wget http://home.apache.org/~mikemccand/enwiki-20120502-lines-1k.txt.lzma
WIKI_MEDIUM_DOCS_LINE_FILE = '%s/data/enwiki-20120502-lines-1k.txt' % BASE_DIR
WIKI_MEDIUM_DOC_BIN_LINE_FILE = '%s/data/enwiki-20120502-lines-1k-with-random-label.bin' % BASE_DIR
WIKI_MEDIUM_DOCS_LINE_FILE = '%s/data/enwiki-20120502-lines-1k-with-random-label.txt' % BASE_DIR
WIKI_MEDIUM_DOCS_COUNT = 33332620

# Word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip (823MB download; 2.1GB unzipped)
Expand Down
113 changes: 113 additions & 0 deletions src/python/createLineFileDocsWithRandomLabel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/env python

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os
import re

from random import randrange

# Help text printed for -help / --help. The script accepts either no arguments
# or exactly two positional paths (input line file, output line file).
USAGE = """
Usage: python createLineFileDocsWithRandomLabel.py [<original_line_file> <target_line_file>]

With no arguments, the input is ../data/enwiki-20120502-lines-1k.txt and the
output is ../data/enwiki-20120502-lines-1k-with-random-label.txt, both resolved
against the parent of the current working directory.
"""

# Default input/output file names, used when no paths are given on the command line.
ORIGINAL_LINEFILE = 'enwiki-20120502-lines-1k.txt'
TARGET_LINEFILE = 'enwiki-20120502-lines-1k-with-random-label.txt'

def createLineFileDocsWithRandomLabels(original_file, target_file):
    """Copy a line-docs file, appending a random-label column to every line.

    The first input line is treated as the header and gets a ``RandomLabel``
    field name appended. Every following line gets one word, picked at random
    from its body (the third tab-separated field), appended as an extra field.
    When the body is missing, blank, or contains no word characters, the body
    is set to ``EMPTY_LABEL`` and ``EMPTY_LABEL`` is used as the label, so the
    output never carries an empty body or label field.

    Args:
        original_file: path of the line-docs file to read, or None to read
            ``<parent of cwd>/data/ORIGINAL_LINEFILE`` and write
            ``<parent of cwd>/data/TARGET_LINEFILE``.
        target_file: path of the file to write; ignored when original_file
            is None.

    Raises:
        SystemExit: with status 1 when the default data directory or the
            given input file does not exist.
    """
    if original_file is None:
        # Default layout: a "data" directory that is a sibling of the cwd.
        parent, _ = os.path.split(os.getcwd())
        data_dir = os.path.join(parent, 'data')

        if not os.path.exists(data_dir):
            print('download data before running this script')
            sys.exit(1)

        original_file = os.path.join(data_dir, ORIGINAL_LINEFILE)
        target_file = os.path.join(data_dir, TARGET_LINEFILE)
    elif not os.path.exists(original_file):
        # os.path.exists, not os.path.exist: the latter raises AttributeError.
        print("Received invalid path to data")
        sys.exit(1)

    # No line is ever skipped; the counter is kept so the summary output
    # keeps its original shape.
    skipped = 0
    i = 0
    with open(original_file, 'r', encoding='ISO-8859-1') as original, \
            open(target_file, 'w', encoding='ISO-8859-1') as out:
        # The header is handled up front so the progress message below is
        # only emitted for real documents.
        header = next(original, None)
        if header is not None:
            header_fields = header.strip().split('\t')
            header_fields.append('RandomLabel')
            out.write('\t'.join(header_fields) + '\n')

        for line in original:
            if i % 100000 == 0:
                print("Converted ", i, " line file docs")
            line_arr = line.strip().split('\t')
            if len(line_arr) < 3:
                # found a few lines that looked like this:
                # ['Biosensor', '05-APR-2012 04:12:36.000'] with no body
                line_arr.append('EMPTY_LABEL')
                random_label = 'EMPTY_LABEL'
            else:
                # chooseRandomLabel returns "EMPTY_BODY" for a body without
                # any word characters (which includes a blank body).
                random_label = chooseRandomLabel(line_arr[2])
                if random_label == "EMPTY_BODY":
                    # Replace the body in place rather than appending, so the
                    # line keeps its field count.
                    random_label = "EMPTY_LABEL"
                    line_arr[2] = "EMPTY_LABEL"
            line_arr.append(random_label)
            out.write('\t'.join(line_arr) + '\n')
            i += 1
    print("Indexed ", i, " total documents")
    print("Skipped ", skipped, " documents")

def chooseRandomLabel(body):
    """Return one word picked at random from body.

    Words are the maximal runs of regex word characters in body; each
    occurrence is equally likely to be picked.

    Args:
        body: the document text to pick a word from.

    Returns:
        The chosen word, or the sentinel string "EMPTY_BODY" when body
        contains no word characters at all.
    """
    # Equivalent to splitting on every non-word character and dropping the
    # empty pieces; every remaining token is non-blank by construction, so no
    # retry loop or empty-label fallback is needed.
    words = re.findall(r'\w+', body)
    if not words:
        return "EMPTY_BODY"
    return words[randrange(len(words))]

def isNotEmpty(str):
    """Report whether the given value is truthy and not made up solely of whitespace."""
    # The parameter name shadows the builtin ``str``; it is kept as-is because
    # it is part of the function's signature.
    return bool(str) and bool(str.strip())

# Command-line entry point: no arguments uses the default data locations,
# two arguments give explicit input and output paths.
if __name__ == '__main__':
    if '-help' in sys.argv or '--help' in sys.argv:
        print(USAGE)
    elif len(sys.argv) == 3:
        createLineFileDocsWithRandomLabels(sys.argv[1], sys.argv[2])
    elif len(sys.argv) == 1:
        createLineFileDocsWithRandomLabels(None, None)
    else:
        # Show how to call the script and signal failure to the caller;
        # sys.exit is used because the site-provided exit() is meant for
        # interactive sessions.
        print("Invalid arguments")
        print(USAGE)
        sys.exit(1)
4 changes: 3 additions & 1 deletion src/python/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
('taxonomy:Month', 'Month'),
('taxonomy:DayOfYear', 'DayOfYear'),
('sortedset:Month', 'Month'),
('sortedset:DayOfYear', 'DayOfYear')))
('sortedset:DayOfYear', 'DayOfYear'),
('taxonomy:RandomLabel', 'RandomLabel'),
('sortedset:RandomLabel', 'RandomLabel')))

#Warning -- Do not break the order of arguments
#TODO -- Fix the following by using argparser
Expand Down
6 changes: 5 additions & 1 deletion src/python/nightlyBench.py
Original file line number Diff line number Diff line change
Expand Up @@ -1738,6 +1738,8 @@ def writeIndexHTML(searchChartData, days):
writeOneLine(w, done, 'OrHighMedDayTaxoFacets', 'high-freq medium-freq +dayOfYear taxo facets')
writeOneLine(w, done, 'AndHighHighDayTaxoFacets', '+high-freq +high-freq +dayOfYear taxo facets')
writeOneLine(w, done, 'AndHighMedDayTaxoFacets', '+high-freq +medium-freq +dayOfYear taxo facets')
writeOneLine(w, done, 'BrowseRandomLabelTaxoFacets', 'Random labels chosen from each doc')
writeOneLine(w, done, 'BrowseRandomLabelSSDVFacets', 'Random labels chosen from each doc (doc values)')

w('<br><br><b>Sorting (on TermQuery):</b>')
writeOneLine(w, done, 'TermDTSort', 'Date/time (long, high cardinality)')
Expand Down Expand Up @@ -1811,7 +1813,9 @@ def writeIndexHTML(searchChartData, days):
'BrowseDateTaxoFacets': 'All hierarchical taxonomy facet counts for last-modified year/month/day',
'BrowseDayOfYearSSDVFacets': 'All flat sorted-set doc values facet counts for last-modified day-of-year',
'BrowseMonthSSDVFacets': 'All flat sorted-set doc values facet counts for last-modified month',
}
'BrowseRandomLabelTaxoFacets': 'All flat taxonomy facet counts for a random word chosen from each doc',
'BrowseRandomLabelSSDVFacets' : 'All flat sorted-set doc values counts for a random word chosen from each doc',
}

def htmlEscape(s):
return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
Expand Down
2 changes: 2 additions & 0 deletions tasks/wikimedium.10M.nostopwords.tasks
Original file line number Diff line number Diff line change
Expand Up @@ -14408,6 +14408,8 @@ BrowseMonthTaxoFacets: *:* +facets:Month.taxonomy
BrowseDayOfYearTaxoFacets: *:* +facets:DayOfYear.taxonomy
BrowseMonthSSDVFacets: *:* +facets:Month.sortedset
BrowseDayOfYearSSDVFacets: *:* +facets:DayOfYear.sortedset
BrowseRandomLabelTaxoFacets: *:* +facets:RandomLabel.taxonomy
BrowseRandomLabelSSDVFacets: *:* +facets:RandomLabel.sortedset

HighTermDayOfYearSort: dayofyeardvsort//ref # freq=3793973
HighTermDayOfYearSort: dayofyeardvsort//http # freq=3493581
Expand Down
2 changes: 2 additions & 0 deletions tasks/wikimedium.10M.tasks
Original file line number Diff line number Diff line change
Expand Up @@ -11230,6 +11230,8 @@ BrowseMonthTaxoFacets: *:* +facets:Month.taxonomy
BrowseDayOfYearTaxoFacets: *:* +facets:DayOfYear.taxonomy
BrowseMonthSSDVFacets: *:* +facets:Month.sortedset
BrowseDayOfYearSSDVFacets: *:* +facets:DayOfYear.sortedset
BrowseRandomLabelSSDVFacets: *:* +facets:RandomLabel.sortedset
BrowseRandomLabelTaxoFacets: *:* +facets:RandomLabel.taxonomy

HighTermDayOfYearSort: dayofyeardvsort//ref # freq=3793973
HighTermDayOfYearSort: dayofyeardvsort//http # freq=3493581
Expand Down
Loading

0 comments on commit fec91aa

Please sign in to comment.