diff --git a/.gitignore b/.gitignore
index 3e759b7..9aefe51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,12 @@
 ##
 ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
 
+# Output from tokenizer and DuplicateCodeDetector
+output/
+DuplicateCodeDetector/DuplicateCodeDetector.csproj.json
+DuplicateCodeDetector/DuplicateCodeDetector.csproj.log
+deduplicated_results
+
 # User-specific files
 *.suo
 *.user
diff --git a/README.md b/README.md
index 398cf2f..e1b2111 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,38 @@
 This cross-platform sample tool detects exact and near duplicates of code maintained by the [Deep Program Understanding](https://www.microsoft.com/en-us/research/project/program/) group in Microsoft Research, Cambridge, UK. It has been created for the purpose of deduplicating code corpora for research purposes.
 
-*Requirements*: .NET Core 2.1 or higher. For parsing code, an appropriate runtime for each of the languages that needs to be tokenized is also required.
+*Requirements*:
+* .NET Core 2.1 or higher. For parsing code, an appropriate runtime for each of the languages that need to be tokenized is also required.
+* Java 1.8 for tokenizing Java code
+* Python 3 for removing the detected duplicates from the code.
+### Duplicate Detection and Removal
+Duplicate removal consists of tokenizing the code, detecting duplicates, copying the dataset and then
+removing the duplicates from the copy. This yields a deduplicated copy while the original dataset stays untouched.
+A convenient shell script is provided for this; just run:
+NOTE: This currently works only for Java.
+```
+sh deduplicate.sh target/project/path output/path/
+```
+After the script finishes, you can find the deduplicated dataset under `output/path/`.
+You can optionally omit the arguments and set the paths directly in the shell script by changing the
+`DEFAULT_TARGET_PROJECT_PATH` and `DEFAULT_DEDUPLICATE_PROJECT_PATH` values.
+
+### Running the Tokenizer
+For Java, run:
+`java -jar tokenizers/java/target/javatokenizer-1.0-SNAPSHOT.jar /path/to/target/project/ ./output true`
+The last boolean argument controls the granularity:
+* true - look only at identifier tokens
+* false - look at identifier names plus all other tokens, including punctuation like ";" and operators like "<", "||", etc.
+
+### Duplicate Detection
 To run the near-duplicate detection run:
 ```
-$ dotnet run /path/to/DuplicateCodeDetector.csproj [options] --dir=<folder>
+$ dotnet run /path/to/DuplicateCodeDetector.csproj --project /path/to/DuplicateCodeDetector/ --dir=<folder>
 ```
-This will use all the `.gz` files in the `<folder>` and output an `.json` with the groups of detected duplicates. Invoke `--help` for more options.
+This will use all the `.gz` files in the `<folder>` generated by the tokenizer and output a
+`DuplicateCodeDetector/CloneDetectorCli.json` file with the groups of detected duplicates.
+Invoke `--help` for more options.
 
 ### Input Data
 
@@ -24,6 +49,10 @@ Alternative formats can be accepted by providing the `--tokens-field` and `--id-
 The `tokenizers` folder in this repository contains tokenizers for C\#, Java, JavaScript and Python. Please, feel free to contribute tokenizers for other languages too.
 
+### Duplicate Removal
+Once the code is tokenized and clones are detected, the removal script can be run:
+`python deduplicate.py --project project/to/deduplicate --duplicates_data data/generated/by/duplicate/detection`
+
 # Contributing
 
 This project welcomes contributions and suggestions. Most contributions require you to agree to a
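For context on the workflow documented above: `deduplicate.py` (added below in this patch) treats the detector output as a JSON array of duplicate groups, each group a list of file paths relative to the tokenized project. The following is a small illustrative sketch, not part of the patch, for summarizing that file before anything is deleted; the example paths in the comment are hypothetical.

```python
# Illustrative helper (not part of this patch): summarize the duplicate groups
# reported by the detector. Assumes the JSON is a list of groups, each a list
# of file paths relative to the project root, e.g. [["A.java", "copy/A.java"]].
import json
import sys


def summarize(duplicates_json_path: str) -> None:
    with open(duplicates_json_path) as f:
        groups = json.load(f)
    total_files = sum(len(group) for group in groups)
    # One file per group is kept, so the rest would be removed by deduplicate.py
    print(f"{len(groups)} duplicate groups, {total_files} files, "
          f"{total_files - len(groups)} files would be removed")


if __name__ == "__main__":
    summarize(sys.argv[1])
```

Run against the JSON produced by the detection step (the shell script below stores it at `DuplicateCodeDetector/DuplicateCodeDetector.csproj.json`), this gives a quick sanity check of how much data the removal will delete.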
diff --git a/deduplicate.py b/deduplicate.py
new file mode 100644
index 0000000..8792329
--- /dev/null
+++ b/deduplicate.py
@@ -0,0 +1,27 @@
+import json
+import os
+import random
+from argparse import ArgumentParser
+
+'''
+This script removes the duplicate files found by the duplicate code detector from a project.
+'''
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_argument("--project", dest="project_path",
+                        help="path to the project from which duplicates should be removed", required=True)
+    parser.add_argument("--duplicates_data", dest="duplicates_data_path",
+                        help="data from DuplicateCodeDetector", required=True)
+    args = parser.parse_args()
+
+    project_path = args.project_path
+    duplicates_data_path = args.duplicates_data_path
+
+    with open(duplicates_data_path) as f:
+        duplicates = json.load(f)
+
+    for duplicate_group in duplicates:  # type: list
+        # Keep one file from each duplicate group in the dataset
+        duplicate_group.remove(random.choice(duplicate_group))
+        for duplicate_path in duplicate_group:  # type: str
+            os.remove(os.path.join(project_path, duplicate_path))
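One design note on the script above: `random.choice` keeps an arbitrary member of each duplicate group, so repeated runs over the same detector output may keep different files. If reproducible results matter, a deterministic variant is a small change; the sketch below is hypothetical and not part of this patch.

```python
# Hypothetical deterministic variant of the removal loop in deduplicate.py:
# always keep the first path of each group so that reruns delete the same files.
import json
import os


def remove_duplicates(project_path: str, duplicates_data_path: str) -> None:
    with open(duplicates_data_path) as f:
        duplicate_groups = json.load(f)
    for group in duplicate_groups:
        # group[0] is kept; every other member of the group is deleted
        for duplicate_path in group[1:]:
            os.remove(os.path.join(project_path, duplicate_path))
```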
diff --git a/deduplicate.sh b/deduplicate.sh
new file mode 100644
index 0000000..e427f7a
--- /dev/null
+++ b/deduplicate.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+###############################################################################
+# Bash script to take a dataset, find duplicates and create
+# a copy of the dataset without near code duplicates.
+#
+# Usage: sh deduplicate.sh target/project/path output/folder/path
+#        if target/project/path is not specified, the script falls
+#        back to DEFAULT_TARGET_PROJECT_PATH. Same for the output path.
+###############################################################################
+# Change the following values to preprocess a new dataset.
+# PATH_TO_TOKENIZER - Path to the tokenizer JAR
+# DEFAULT_TARGET_PROJECT_PATH - Path to the target project if
+#                               not specified as a parameter
+# TOKENIZER_OUTPUT_PATH - Output folder for the tokenizer
+# IDENTIFIER_ONLY - Boolean specifying whether to tokenize only
+#                   identifiers or all tokens in the code
+#
+# DUPLICATE_DETECTOR_PROJECT_PATH - Path to the DuplicateCodeDetector project
+# DUPLICATE_DETECTOR_PATH - Path to the DuplicateCodeDetector C# project file
+#
+# DEDUPLICATE_PROJECT_PATH - Path for the resulting deduplicated project
+# DEDUPLICATION_DATA - Path where the deduplication data is temporarily saved as JSON
+#
+# JAVA - java 1.8 alias
+# DOTNET - dotnet alias
+# PYTHON - python3 interpreter alias.
+###############################################################################
+# Changing DEFAULT_TARGET_PROJECT_PATH or specifying it
+# as a program argument is enough for most users.
+DEFAULT_TARGET_PROJECT_PATH="path/to/project/if/not/specified/in/parameters"
+# ${n:-value} takes the n-th positional argument or falls back to "value"
+TARGET_PROJECT_PATH=${1:-${DEFAULT_TARGET_PROJECT_PATH}}
+###############################################################################
+PATH_TO_TOKENIZER="tokenizers/java/target/javatokenizer-1.0-SNAPSHOT.jar"
+TOKENIZER_OUTPUT_PATH="output/"
+IDENTIFIER_ONLY="true"
+
+DUPLICATE_DETECTOR_PROJECT_PATH="DuplicateCodeDetector"
+DUPLICATE_DETECTOR_PATH="${DUPLICATE_DETECTOR_PROJECT_PATH}/DuplicateCodeDetector.csproj"
+
+DEFAULT_DEDUPLICATE_PROJECT_PATH="deduplicated_results"
+DEDUPLICATE_PROJECT_PATH=${2:-${DEFAULT_DEDUPLICATE_PROJECT_PATH}}
+DEDUPLICATION_DATA="${DUPLICATE_DETECTOR_PROJECT_PATH}/DuplicateCodeDetector.csproj.json"
+
+JAVA=java
+DOTNET=dotnet
+PYTHON=python
+
+rm -rf ${DEDUPLICATE_PROJECT_PATH}
+
+echo "Running tokenizer..."
+${JAVA} -jar ${PATH_TO_TOKENIZER} ${TARGET_PROJECT_PATH} ${TOKENIZER_OUTPUT_PATH} ${IDENTIFIER_ONLY}
+echo "Tokenizer finished."
+
+echo "Running near duplicate code detection..."
+${DOTNET} run ${DUPLICATE_DETECTOR_PATH} --project=${DUPLICATE_DETECTOR_PROJECT_PATH} --dir=${TOKENIZER_OUTPUT_PATH}
+echo "Near duplicate code detection finished."
+
+echo "Copying project to ${DEDUPLICATE_PROJECT_PATH}"
+cp -r ${TARGET_PROJECT_PATH}/. ${DEDUPLICATE_PROJECT_PATH}
+echo "Copying finished"
+
+echo "Removing duplicates from the copy"
+${PYTHON} deduplicate.py --project ${DEDUPLICATE_PROJECT_PATH} --duplicates_data ${DEDUPLICATION_DATA}
+echo "Finished removing near duplicates"
+echo "Untouched project location: ${TARGET_PROJECT_PATH}"
+echo "Resulting project with duplicates removed: ${DEDUPLICATE_PROJECT_PATH}"
+
+# If all went well, tokenizer output is not needed anymore
+rm -r ${TOKENIZER_OUTPUT_PATH}
\ No newline at end of file
diff --git a/tokenizers/java/.gitignore b/tokenizers/java/.gitignore
new file mode 100644
index 0000000..1de5659
--- /dev/null
+++ b/tokenizers/java/.gitignore
@@ -0,0 +1 @@
+target
\ No newline at end of file
diff --git a/tokenizers/java/pom.xml b/tokenizers/java/pom.xml
index e0fe5f5..eac92ac 100644
--- a/tokenizers/java/pom.xml
+++ b/tokenizers/java/pom.xml
@@ -17,7 +17,7 @@
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-
+
@@ -37,4 +37,23 @@
             <version>2.8.5</version>
         </dependency>
     </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <configuration>
+                    <archive>
+                        <manifest>
+                            <mainClass>javatokenizer.Extractor</mainClass>
+                        </manifest>
+                    </archive>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                    <appendAssemblyId>false</appendAssemblyId>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
 </project>
\ No newline at end of file
diff --git a/tokenizers/java/src/main/java/javatokenizer/Extractor.java b/tokenizers/java/src/main/java/javatokenizer/Extractor.java
index 3852ba3..ac035dd 100644
--- a/tokenizers/java/src/main/java/javatokenizer/Extractor.java
+++ b/tokenizers/java/src/main/java/javatokenizer/Extractor.java
@@ -51,6 +51,12 @@ public boolean accept(File current, String name) {
     public static void ExtractForFolder(File projectFolder, File outputFolder, boolean onlyIdentifiers, File baseFolder) {
         Iterator<File> allFiles = FileUtils.iterateFiles(projectFolder, new String[] {"java"}, true);
         try {
+            String outputFileString = Paths.get(outputFolder.toPath().toString(), projectFolder.getName() + ".jsonl.gz").toString();
+            File outputFile = new File(outputFileString);
+            // Make the parent directory if it doesn't exist
+            outputFile.getParentFile().mkdirs();
+            // Make the file if it doesn't exist
+            outputFile.createNewFile();
             FileOutputStream output = new FileOutputStream(Paths.get(outputFolder.toPath().toString(), projectFolder.getName() + ".jsonl.gz").toFile());
 
             Gson gson = new GsonBuilder().create();
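The Extractor change above ensures the output folder exists before writing one `<project>.jsonl.gz` file per tokenized project. As a quick way to inspect that output, a reader along the following lines can be used; this sketch is not part of the patch, and the record field names are assumptions to be matched against the detector's `--id-field` and `--tokens-field` options.

```python
# Illustrative sketch (not part of this patch): peek at a tokenizer output file,
# e.g. output/<project>.jsonl.gz. Each line is assumed to be one JSON record
# containing an id field and a tokens field; adjust to the actual field names.
import gzip
import json
import sys


def peek(jsonl_gz_path: str, limit: int = 3) -> None:
    with gzip.open(jsonl_gz_path, "rt", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            record = json.loads(line)
            # Print the available keys so --id-field/--tokens-field can be set accordingly
            print(sorted(record.keys()))


if __name__ == "__main__":
    peek(sys.argv[1])
```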
diff --git a/tokenizers/java/target/javatokenizer-1.0-SNAPSHOT.jar b/tokenizers/java/target/javatokenizer-1.0-SNAPSHOT.jar
new file mode 100644
index 0000000..a530fe8
Binary files /dev/null and b/tokenizers/java/target/javatokenizer-1.0-SNAPSHOT.jar differ