pan09-plagiarism-detection-performance-measures.py
#! /usr/bin/python
# Copyright (C) 2009 webis.de. All rights reserved.
"""Plagiarism detection performance measures.
This module implements the measures recall, precision, and granularity
as described by the authors of [1]. The measures can be calculated
macro-averaged and micro-averaged with the respective functions; each
one's parameters are iterables of reference plagiarism cases and
plagiarism detections. The latter are compared with the former in order
to determine how accurately the cases have been detected with respect to
different notions of detection accuracy. Moreover, the function
plagdet_score combines recall, precision, and granularity values into a
single value which allows plagiarism detection algorithms to be ranked.
The parameters 'cases' and 'detections' must both contain instances of
the Annotation class, a 7-tuple whose fields are exemplified by the
following values:
>>> Annotation('suspicious-document00001.txt', 10000, 1000, \
... 'source-document00001.txt', 5000, 1100, True)
where the first three values reference a section of text in a
suspicious document by means of char offset and length, and likewise
the following three values reference a section of text in a source
document. The last value specifies whether the annotation is to be
treated as an external detection or as an intrinsic detection. In the
latter case, the preceding three values should be set to '', 0, and 0,
respectively.
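As a minimal illustrative sketch (not part of the original examples), a
one-to-one detection of a single external case yields perfect scores:
>>> case = Annotation('suspicious-document00001.txt', 0, 100, \
... 'source-document00001.txt', 0, 100, True)
>>> macro_avg_recall_and_precision([case], [case])
(1.0, 1.0)
>>> plagdet_score(1.0, 1.0, granularity([case], [case]))
1.0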
Finally, this module contains functions to extract plagiarism
annotations from XML documents which contain tags with a given name
attribute, and with values corresponding to those required by the
Annotation class, e.g., the XML format used in the PAN benchmarking
workshops [2,3].
[1] Martin Potthast, Benno Stein, Alberto Barron-Cedeno, and Paolo Rosso.
An Evaluation Framework for Plagiarism Detection.
In Proceedings of the 23rd International Conference on Computational
Linguistics (COLING 2010), Beijing, China. August 2010. ACL.
[2] Martin Potthast, Benno Stein, Andreas Eiselt, Alberto Barron-Cedeno,
and Paolo Rosso. Overview of the 1st International Competition on
Plagiarism Detection. In Benno Stein, Paolo Rosso, Efstathios
Stamatatos, Moshe Koppel, and Eneko Agirre, editors, SEPLN 2009
Workshop on Uncovering Plagiarism, Authorship, and Social Software
Misuse (PAN 09), pages 1-9, September 2009. CEUR-WS.org.
ISSN 1613-0073.
[3] Martin Potthast, Benno Stein, Andreas Eiselt, Alberto Barron-Cedeno,
and Paolo Rosso. Overview of the 2nd International Benchmarking
Workshop on Plagiarism Detection. In Benno Stein, Paolo Rosso,
Efstathios Stamatatos, and Moshe Koppel, editors, Proceedings of
PAN at CLEF 2010: Uncovering Plagiarism, Authorship, and Social
Software Misuse, September 2010.
"""
from __future__ import division
__author__ = "Martin Potthast"
__email__ = "martin.potthast at uni-weimar dot de"
__version__ = "1.3"
__all__ = ["macro_avg_recall_and_precision", "micro_avg_recall_and_precision",
"granularity", "plagdet_score", "Annotation"]
from collections import namedtuple
import getopt
import glob
import math
from numpy.ma import zeros, sum as npsum
import os
import sys
import unittest
import xml.dom.minidom
TREF, TOFF, TLEN = 'this_reference', 'this_offset', 'this_length'
SREF, SOFF, SLEN = 'source_reference', 'source_offset', 'source_length'
EXT = 'is_external'
Annotation = namedtuple('Annotation', [TREF, TOFF, TLEN, SREF, SOFF, SLEN, EXT])
TREF, TOFF, TLEN, SREF, SOFF, SLEN, EXT = range(7)
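# Note (added for clarity): the names above first serve as field names of the
# namedtuple and are then rebound to the integer positions 0..6, so that an
# annotation can be accessed both by attribute (ann.this_reference) and by
# index (ann[TREF]).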
def macro_avg_recall_and_precision(cases, detections):
"""Returns tuple (rec, prec); the macro-averaged recall and precision of the
detections in detecting the plagiarism cases"""
return macro_avg_recall(cases, detections), \
macro_avg_precision(cases, detections)
def micro_avg_recall_and_precision(cases, detections):
"""Returns tuple (rec, prec); the micro-averaged recall and precision of the
detections in detecting the plagiarism cases"""
if len(cases) == 0 and len(detections) == 0:
return 1, 1
if len(cases) == 0 or len(detections) == 0:
return 0, 0
num_plagiarized, num_detected, num_plagiarized_detected = 0, 0, 0 # chars
num_plagiarized += count_chars(cases)
num_detected += count_chars(detections)
detections = true_detections(cases, detections)
num_plagiarized_detected += count_chars(detections)
rec, prec = 0, 0
if num_plagiarized > 0:
rec = num_plagiarized_detected / num_plagiarized
if num_detected > 0:
prec = num_plagiarized_detected / num_detected
return rec, prec
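# Illustrative note (not from the original source): micro-averaging pools
# characters across all cases, so long cases dominate the result. For example,
# given a 100-char and a 900-char case (counting suspicious and source
# passages alike, i.e. 200 and 1800 chars), detecting only the short case
# exactly yields a micro-averaged recall of 200 / 2000 = 0.1, whereas the
# macro-averaged recall would be (1 + 0) / 2 = 0.5.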
def granularity(cases, detections):
"""Granularity of the detections in detecting the plagiarism cases."""
if len(detections) == 0:
return 1
detections_per_case = list()
case_index = index_annotations(cases)
det_index = index_annotations(detections)
for tref in case_index:
cases, detections = case_index[tref], det_index.get(tref, False)
if not detections: # No detections for document tref.
continue
for case in cases:
num_dets = sum((is_overlapping(case, det) for det in detections))
detections_per_case.append(num_dets)
detected_cases = sum((num_dets > 0 for num_dets in detections_per_case))
if detected_cases == 0:
return 1
return sum(detections_per_case) / detected_cases
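# Illustrative note (not from the original source): granularity penalizes
# reporting one plagiarism case as several detections. A case covered by two
# overlapping detections contributes 2, so a corpus with only that case has
# granularity 2 (cf. test_granularity below); a one-to-one mapping yields the
# ideal value 1.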
def plagdet_score(rec, prec, gran):
"""Combines recall, precision, and granularity to a allow for ranking."""
if (rec == 0 and prec == 0) or prec < 0 or rec < 0 or gran < 1:
return 0
return ((2 * rec * prec) / (rec + prec)) / math.log(1 + gran, 2)
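# Note (added for clarity): this is the formula from [1], i.e.
# plagdet = F_1(rec, prec) / log2(1 + gran). For instance, rec = prec = 1 and
# gran = 2 give 1 / log2(3), approximately 0.63093, the value checked in
# test_plagdet_score below.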
def macro_avg_recall(cases, detections):
"""Recall of the detections in detecting plagiarism cases."""
if len(cases) == 0 and len(detections) == 0:
return 1
elif len(cases) == 0 or len(detections) == 0:
return 0
num_cases, recall_per_case = len(cases), list()
case_index = index_annotations(cases)
det_index = index_annotations(detections)
for tref in case_index:
cases, detections = case_index[tref], det_index.get(tref, False)
if not detections: # No detections for document tref.
continue
for case in cases:
recall_per_case.append(case_recall(case, detections))
return sum(recall_per_case) / num_cases
def case_recall(case, detections):
"""Recall of the detections in detecting the plagiarism case."""
num_detected_plagiarized = overlapping_chars(case, detections)
num_plagiarized = case[TLEN] + case[SLEN]
return num_detected_plagiarized / num_plagiarized
def macro_avg_precision(cases, detections):
"""Precision of the detections in detecting the plagiarism cases."""
    # Note: the arguments are swapped compared to 'macro_avg_recall(cases, detections)'.
return macro_avg_recall(detections, cases)
def true_detections(cases, detections):
"""Recreates the detections so that only true detections remain and so that
the true detections are reduced to the passages that actually overlap
with the respective detected case."""
true_dets = list()
case_index = index_annotations(cases)
det_index = index_annotations(detections)
for tref in case_index:
cases, detections = case_index[tref], det_index.get(tref, False)
if not detections: # No detections for document tref.
continue
for case in cases:
case_dets = (det for det in detections if is_overlapping(case, det))
true_case_dets = (overlap_annotation(case, det) for det in case_dets)
true_dets.extend(true_case_dets)
return true_dets
def overlap_annotation(ann1, ann2):
"""Returns an Annotation that annotates overlaps between ann1 and ann2."""
tref, sref, ext = ann1[TREF], ann1[SREF], ann1[EXT] and ann2[EXT]
toff, tlen, soff, slen = 0, 0, 0, 0
if is_overlapping(ann1, ann2):
toff, tlen = overlap_chars(ann1, ann2, TOFF, TLEN)
if ext:
soff, slen = overlap_chars(ann1, ann2, SOFF, SLEN)
return Annotation(tref, toff, tlen, sref, soff, slen, ext)
def overlap_chars(ann1, ann2, xoff, xlen):
"""Returns the overlapping passage between ann1 and ann2, given the keys
xoff and xlen."""
overlap_start, overlap_length = 0, 0
max_ann = ann1 if ann1[xoff] >= ann2[xoff] else ann2
min_ann = ann1 if ann1[xoff] < ann2[xoff] else ann2
if min_ann[xoff] + min_ann[xlen] > max_ann[xoff]:
overlap_start = max_ann[xoff]
overlap_end = min(min_ann[xoff] + min_ann[xlen], \
max_ann[xoff] + max_ann[xlen])
overlap_length = overlap_end - overlap_start
return overlap_start, overlap_length
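# Worked example (added for illustration): for annotations covering the char
# ranges [10, 30) and [20, 50) under the given offset/length keys,
# overlap_chars returns (20, 10), i.e. the overlap starts at offset 20 and is
# 10 chars long; disjoint ranges yield a length of 0.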
def count_chars(annotations):
"""Returns the number of chars covered by the annotations, while counting
overlapping chars only once."""
num_chars = count_chars2(annotations, TREF, TOFF, TLEN)
num_chars += count_chars2(annotations, SREF, SOFF, SLEN)
return num_chars
def count_chars2(annotations, xref, xoff, xlen):
"""Returns the number of cvhars covered by the annotations with regard to
the keys xref, xoff, and xlen."""
num_chars = 0
max_length = max((ann[xoff] + ann[xlen] for ann in annotations))
char_bits = zeros(max_length, dtype=bool)
xref_index = index_annotations(annotations, xref)
for xref in xref_index:
annotations = xref_index[xref]
char_bits[:] = False
for ann in annotations:
char_bits[ann[xoff]:ann[xoff] + ann[xlen]] = True
num_chars += npsum(char_bits)
return num_chars
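# Worked example (added for illustration): the bit mask above counts every
# char at most once per referenced document, so two annotations covering
# [0, 100) and [50, 150) of the same reference contribute 150 chars, not 200.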
def overlapping_chars(ann1, annotations):
"""Returns the number of chars in ann1 that overlap with the annotations."""
annotations = [ann2 for ann2 in annotations if is_overlapping(ann1, ann2)]
if len(annotations) == 0 or not isinstance(ann1, Annotation):
return 0
this_overlaps = zeros(ann1[TLEN], dtype=bool)
source_overlaps = zeros(ann1[SLEN], dtype=bool)
for ann2 in annotations:
mark_overlapping_chars(this_overlaps, ann1, ann2, TOFF, TLEN)
mark_overlapping_chars(source_overlaps, ann1, ann2, SOFF, SLEN)
return npsum(this_overlaps) + npsum(source_overlaps)
def mark_overlapping_chars(char_bits, ann1, ann2, xoff, xlen):
"""Sets the i-th boolean in char_bits to true if ann2 overlaps with the i-th
char in ann1, respecting the given xoff and xlen index."""
offset_difference = ann2[xoff] - ann1[xoff]
overlap_start = min(max(0, offset_difference), ann1[xlen])
overlap_end = min(max(0, offset_difference + ann2[xlen]), ann1[xlen])
char_bits[overlap_start:overlap_end] = True
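# Worked example (added for illustration): if ann1 covers chars [100, 200) and
# ann2 covers chars [150, 300) under the given keys, the slice
# char_bits[50:100] is set, i.e. the last 50 chars of ann1 are marked.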
def is_overlapping(ann1, ann2):
"""Returns true iff the ann2 overlaps with ann1."""
detected = ann1[TREF] == ann2[TREF] and \
ann2[TOFF] + ann2[TLEN] > ann1[TOFF] and \
ann2[TOFF] < ann1[TOFF] + ann1[TLEN]
    if ann1[EXT] and ann2[EXT]:
detected = detected and ann1[SREF] == ann2[SREF] and \
ann2[SOFF] + ann2[SLEN] > ann1[SOFF] and \
ann2[SOFF] < ann1[SOFF] + ann1[SLEN]
return detected
def index_annotations(annotations, xref=TREF):
"""Returns an inverted index that maps references to annotation lists."""
index = dict()
for ann in annotations:
index.setdefault(ann[xref], []).append(ann)
return index
def extract_annotations_from_files(path, tagname):
"""Returns a set of plagiarism annotations from XML files below path."""
if not os.path.exists(path):
print "Path not accessible:", path
sys.exit(2)
annotations = set()
xmlfiles = glob.glob(os.path.join(path, '*.xml'))
xmlfiles.extend(glob.glob(os.path.join(path, os.path.join('*', '*.xml'))))
for xmlfile in xmlfiles:
annotations.update(extract_annotations_from_file(xmlfile, tagname))
return annotations
def extract_annotations_from_file(xmlfile, tagname):
"""Returns a set of plagiarism annotations from an XML file."""
doc = xml.dom.minidom.parse(xmlfile)
annotations = set()
if not doc.documentElement.hasAttribute('reference'):
return annotations
t_ref = doc.documentElement.getAttribute('reference')
for node in doc.documentElement.childNodes:
if node.nodeType == xml.dom.Node.ELEMENT_NODE and \
node.hasAttribute('name') and \
node.getAttribute('name').endswith(tagname):
ann = extract_annotation_from_node(node, t_ref)
if ann:
annotations.add(ann)
return annotations
def extract_annotation_from_node(xmlnode, t_ref):
"""Returns a plagiarism annotation from an XML feature tag node."""
if not (xmlnode.hasAttribute('this_offset') and \
xmlnode.hasAttribute('this_length')):
return False
t_off = int(xmlnode.getAttribute('this_offset'))
t_len = int(xmlnode.getAttribute('this_length'))
s_ref, s_off, s_len, ext = '', 0, 0, False
if xmlnode.hasAttribute('source_reference') and \
xmlnode.hasAttribute('source_offset') and \
xmlnode.hasAttribute('source_length'):
s_ref = xmlnode.getAttribute('source_reference')
s_off = int(xmlnode.getAttribute('source_offset'))
s_len = int(xmlnode.getAttribute('source_length'))
ext = True
return Annotation(t_ref, t_off, t_len, s_ref, s_off, s_len, ext)
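# Illustrative sketch (not from the original source): judging from the
# attributes read above, a plagiarism annotation in the PAN XML format looks
# roughly as follows, where the document element's 'reference' attribute
# names the suspicious document:
#
#   <document reference="suspicious-document00001.txt">
#     <feature name="plagiarism" this_offset="10000" this_length="1000"
#              source_reference="source-document00001.txt"
#              source_offset="5000" source_length="1100" />
#   </document>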
class TestPerfMeasures(unittest.TestCase):
"""Unit tests for the plagiarism detection performance measures."""
ann1 = Annotation('tref1', 0, 100, 'sref1', 0, 100, True)
ann2 = Annotation('tref1', 0, 100, '', 0, 0, False)
ann3 = Annotation('tref1', 100, 100, 'sref1', 100, 100, True)
ann4 = Annotation('tref1', 0, 200, 'sref1', 0, 200, True)
ann5 = Annotation('tref1', 0, 1, 'sref1', 0, 1, True)
ann6 = Annotation('tref1', 99, 1, 'sref1', 99, 1, True)
ann7 = Annotation('tref2', 0, 100, 'sref2', 0, 100, True)
ann8 = Annotation('tref2', 0, 100, '', 0, 0, False)
ann9 = Annotation('tref2', 50, 100, 'sref2', 50, 100, True)
ann10 = Annotation('tref2', 25, 75, 'sref2', 25, 75, True)
def test_macro_averaged_recall(self):
self.assertEqual(1, macro_avg_recall([], []))
self.assertEqual(0, macro_avg_recall(['sth'], []))
self.assertEqual(0, macro_avg_recall([], ['sth']))
self.assertEqual(1, macro_avg_recall([self.ann1], [self.ann1]))
self.assertEqual(1, macro_avg_recall([self.ann2], [self.ann2]))
self.assertEqual(0.5, macro_avg_recall([self.ann1, self.ann7], \
[self.ann1]))
self.assertEqual(0.5, macro_avg_recall([self.ann2, self.ann8], \
[self.ann2]))
self.assertEqual(0, macro_avg_recall([self.ann1], [self.ann7]))
self.assertEqual(0, macro_avg_recall([self.ann2], [self.ann8]))
def test_case_recall(self):
self.assertEqual(0, case_recall(self.ann1, []))
self.assertEqual(1, case_recall(self.ann1, [self.ann1]))
self.assertEqual(0.5, case_recall(self.ann1, [self.ann2]))
self.assertEqual(0, case_recall(self.ann1, [self.ann3]))
self.assertEqual(1, case_recall(self.ann1, [self.ann4]))
self.assertEqual(1, case_recall(self.ann1, [self.ann4, self.ann7]))
self.assertEqual(0, case_recall(self.ann1, [self.ann7, self.ann9]))
self.assertEqual(0.5, case_recall(self.ann7, [self.ann9]))
self.assertEqual(0.75, case_recall(self.ann7, [self.ann10]))
self.assertEqual(0.75, case_recall(self.ann7, [self.ann9, self.ann10]))
def test_macro_averaged_precision(self):
self.assertEqual(1, macro_avg_precision([], []))
self.assertEqual(0, macro_avg_precision(['sth'], []))
self.assertEqual(0, macro_avg_precision([], ['sth']))
self.assertEqual(1, macro_avg_precision([self.ann1], [self.ann1]))
self.assertEqual(1, macro_avg_precision([self.ann2], [self.ann2]))
self.assertEqual(1, macro_avg_precision([self.ann1, self.ann7], \
[self.ann1]))
self.assertEqual(1, macro_avg_precision([self.ann2, self.ann8], \
[self.ann2]))
self.assertEqual(0.5, macro_avg_precision([self.ann1], [self.ann4]))
self.assertEqual(1, macro_avg_precision([self.ann7], [self.ann10]))
self.assertEqual(1, macro_avg_precision([self.ann7], [self.ann10]))
self.assertEqual(0.75, macro_avg_precision([self.ann7], \
[self.ann9, self.ann10]))
self.assertEqual(0.25, macro_avg_precision([self.ann1], \
[self.ann3, self.ann4]))
def test_granularity(self):
self.assertEqual(1, granularity([], []))
self.assertEqual(1, granularity([self.ann1], [self.ann2]))
self.assertEqual(1, granularity([self.ann1], [self.ann2, self.ann3]))
self.assertEqual(2, granularity([self.ann1],
[self.ann2, self.ann3, self.ann4]))
self.assertEqual(1.5, granularity([self.ann1, self.ann3],
[self.ann2, self.ann4]))
def test_plagdet_score(self):
self.assertEqual(0, plagdet_score(-1, 0, 0))
self.assertEqual(0, plagdet_score(0, -1, 0))
self.assertEqual(0, plagdet_score(0, 0, -1))
self.assertEqual(0, plagdet_score(0, 0, 1))
self.assertEqual(0, plagdet_score(0, 1, 1))
self.assertEqual(0, plagdet_score(1, 0, 1))
self.assertEqual(1, plagdet_score(1, 1, 1))
self.assertEqual(2 / 3, plagdet_score(0.5, 1, 1))
self.assertEqual(2 / 3, plagdet_score(1, 0.5, 1))
self.assertAlmostEqual(0.63092975, plagdet_score(1, 1, 2))
self.assertAlmostEqual(0.23659865, plagdet_score(0.25, 0.75, 2))
def test_is_overlapping(self):
self.assertTrue(is_overlapping(self.ann1, self.ann2))
self.assertFalse(is_overlapping(self.ann1, self.ann3))
self.assertTrue(is_overlapping(self.ann1, self.ann4))
self.assertFalse(is_overlapping(self.ann1, self.ann7))
self.assertFalse(is_overlapping(self.ann1, self.ann8))
self.assertFalse(is_overlapping(self.ann1, self.ann9))
self.assertFalse(is_overlapping(self.ann1, self.ann10))
self.assertTrue(is_overlapping(self.ann1, self.ann5))
self.assertTrue(is_overlapping(self.ann1, self.ann6))
def test_index_annotations(self):
index = index_annotations([self.ann1, self.ann7, self.ann2, self.ann8])
self.assertEqual([self.ann1, self.ann2], index.get('tref1'))
self.assertEqual([self.ann7, self.ann8], index.get('tref2'))
def usage():
"""Prints command line usage manual."""
print """\
Usage: pan09-plagiarism-detection-performance-measures.py [options]
Options:
--micro Compute micro-averaged recall and precision,
default: macro-averaged recall and precision
-p, --plag-path Path to the XML files with plagiarism annotations
--plag-tag Tag name suffix of plagiarism annotations,
default: 'plagiarism'
-d, --det-path Path to the XML files with detection annotations
--det-tag Tag name of the detection annotations,
default: 'detected-plagiarism'
-h, --help Show this message
"""
def parse_options():
"""Parses the command line options."""
try:
long_options = ["micro", "plag-path=", "plag-tag=", "det-path=",
"det-tag=", "help"]
opts, _ = getopt.getopt(sys.argv[1:], "p:d:h", long_options)
except getopt.GetoptError, err:
print str(err)
usage()
sys.exit(2)
micro_averaged = False
plag_path, det_path = "undefined", "undefined"
plag_tag_name, det_tag_name = "plagiarism", "detected-plagiarism"
for opt, arg in opts:
if opt in ("--micro"):
micro_averaged = True
elif opt in ("-p", "--plag-path"):
plag_path = arg
elif opt == "--plag-tag":
plag_tag_name = arg
elif opt in ("-d", "--det-path"):
det_path = arg
elif opt == "--det-tag":
det_tag_name = arg
elif opt in ("-h", "--help"):
usage()
sys.exit()
else:
assert False, "Unknown option."
if plag_path == "undefined":
print "Plagiarism path undefined. Use option -p or --plag-path."
sys.exit()
if det_path == "undefined":
print "Detections path undefined. Use option -d or --det-path."
sys.exit()
return (micro_averaged, plag_path, plag_tag_name, det_path, det_tag_name)
def main(micro_averaged, plag_path, plag_tag_name, det_path, det_tag_name):
"""Main method of this module."""
print 'Reading', plag_path
cases = extract_annotations_from_files(plag_path, plag_tag_name)
print 'Reading', det_path
detections = extract_annotations_from_files(det_path, det_tag_name)
print 'Processing... (this may take a while)'
rec, prec = 0, 0
if micro_averaged:
rec, prec = micro_avg_recall_and_precision(cases, detections)
else:
rec, prec = macro_avg_recall_and_precision(cases, detections)
gran = granularity(cases, detections)
print 'Plagdet Score', plagdet_score(rec, prec, gran)
print 'Recall', rec
print 'Precision', prec
print 'Granularity', gran
if __name__ == '__main__':
main(*parse_options())