Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve code quality of token sequence normalization #1872

Merged
merged 8 commits into from
Jul 31, 2024
3 changes: 2 additions & 1 deletion core/src/main/java/de/jplag/JPlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ public static JPlagResult run(JPlagOptions options) throws ExitException {
SubmissionSetBuilder builder = new SubmissionSetBuilder(options);
SubmissionSet submissionSet = builder.buildSubmissionSet();
if (options.normalize() && options.language().supportsNormalization() && options.language().requiresCoreNormalization()) {
submissionSet.normalizeSubmissions();
boolean normalizeOrder = !options.mergingOptions().enabled(); // match merging conflicts with sorting
submissionSet.normalizeSubmissions(normalizeOrder);
}
int submissionCount = submissionSet.numberOfSubmissions();
if (submissionCount < 2) {
Expand Down
7 changes: 4 additions & 3 deletions core/src/main/java/de/jplag/Submission.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
import org.slf4j.LoggerFactory;

import de.jplag.exceptions.LanguageException;
import de.jplag.normalization.TokenStringNormalizer;
import de.jplag.normalization.TokenSequenceNormalizer;
import de.jplag.options.JPlagOptions;

/**
Expand Down Expand Up @@ -256,10 +256,11 @@ private static File createErrorDirectory(String... subdirectoryNames) {
/**
* Perform token sequence normalization, which makes the token sequence invariant to dead code insertion and independent
* statement reordering.
* @param sorting determines whether to perform topological sorting during normalization.
*/
void normalize() {
void normalize(boolean sorting) {
List<Integer> originalOrder = getOrder(tokenList);
tokenList = TokenStringNormalizer.normalize(tokenList);
tokenList = TokenSequenceNormalizer.normalize(tokenList, sorting);
List<Integer> normalizedOrder = getOrder(tokenList);

logger.debug("original line order: {}", originalOrder);
Expand Down
12 changes: 9 additions & 3 deletions core/src/main/java/de/jplag/SubmissionSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,17 @@ public List<Submission> getInvalidSubmissions() {
return invalidSubmissions;
}

public void normalizeSubmissions() {
/**
* Normalizes the token sequences of all submissions (including basecode). This makes the token sequence invariant to
* dead code insertion and independent statement reordering by removing dead tokens and optionally reordering tokens to
* a deterministic order.
* @param sorting determines whether to perform topological sorting during normalization.
*/
public void normalizeSubmissions(boolean sorting) {
if (baseCodeSubmission != null) {
baseCodeSubmission.normalize();
baseCodeSubmission.normalize(sorting);
}
ProgressBarLogger.iterate(ProgressBarType.TOKEN_STRING_NORMALIZATION, submissions, Submission::normalize);
ProgressBarLogger.iterate(ProgressBarType.TOKEN_STRING_NORMALIZATION, submissions, submission -> submission.normalize(sorting));
}

private List<Submission> filterValidSubmissions() {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package de.jplag.normalization;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
Expand All @@ -16,24 +15,33 @@
/**
* Performs token sequence normalization.
*/
public class TokenStringNormalizer {
public final class TokenSequenceNormalizer {

private TokenStringNormalizer() {
private TokenSequenceNormalizer() {
// private constructor for non-instantiability.
}

/**
* Performs token sequence normalization. Tokens representing dead code have been eliminated and tokens representing
* subsequent independent statements have been put in a fixed order. Works by first constructing a Normalization Graph
* and then turning it back into a token sequence.
* subsequent independent statements have been put in a fixed order if sorting is true. Works by first constructing a
* Normalization Graph and then turning it back into a token sequence.
* @param tokens The original token sequence, remains unaltered.
* @return The normalized token sequence as unmodifiable list.
* @param sorting Boolean flag to control if the tokens should be topologically sorted.
* @return The normalized token sequence.
*/
public static List<Token> normalize(List<Token> tokens) {
public static List<Token> normalize(List<Token> tokens, boolean sorting) {
SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph = new NormalizationGraphConstructor(tokens).get();
propagateKeepStatus(normalizationGraph);
if (sorting) {
return normalizeWithSorting(tokens, normalizationGraph);
tsaglam marked this conversation as resolved.
Show resolved Hide resolved
}
return normalizeWithoutSorting(normalizationGraph, tokens);
}

// Add tokens in normalized original order, removing dead tokens
private static List<Token> normalizeWithSorting(List<Token> tokens, SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph) {
List<Token> normalizedTokens = new ArrayList<>(tokens.size());
spreadKeep(normalizationGraph);
PriorityQueue<Statement> roots = normalizationGraph.vertexSet().stream() //
.filter(v -> !Graphs.vertexHasPredecessors(normalizationGraph, v)) //
PriorityQueue<Statement> roots = normalizationGraph.vertexSet().stream().filter(v -> !Graphs.vertexHasPredecessors(normalizationGraph, v))
.collect(Collectors.toCollection(PriorityQueue::new));
while (!roots.isEmpty()) {
PriorityQueue<Statement> newRoots = new PriorityQueue<>();
Expand All @@ -51,13 +59,24 @@ public static List<Token> normalize(List<Token> tokens) {
} while (!roots.isEmpty());
roots = newRoots;
}
return Collections.unmodifiableList(normalizedTokens);
return normalizedTokens;
}

/**
 * Collects the tokens of all statements that are marked as keep, visiting the statements in the iteration order of the
 * graph's vertex set. Statements without keep status (dead code) contribute no tokens.
 * @param normalizationGraph The normalization graph; keep status is expected to be propagated already.
 * @param tokens The original token sequence, only used to pre-size the result list.
 * @return A new list containing the tokens of all kept statements.
 */
private static List<Token> normalizeWithoutSorting(SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph, List<Token> tokens) {
    return normalizationGraph.vertexSet().stream() //
            .filter(statement -> statement.semantics().keep()) //
            .flatMap(statement -> statement.tokens().stream()) //
            .collect(Collectors.toCollection(() -> new ArrayList<>(tokens.size())));
}

/**
* Spread keep status to every node that does not represent dead code. Nodes without keep status are later eliminated.
*/
private static void spreadKeep(SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph) {
private static void propagateKeepStatus(SimpleDirectedGraph<Statement, MultipleEdge> normalizationGraph) {
Queue<Statement> visit = new LinkedList<>(normalizationGraph.vertexSet().stream() //
.filter(tl -> tl.semantics().keep()).toList());
while (!visit.isEmpty()) {
Expand Down
34 changes: 24 additions & 10 deletions core/src/test/java/de/jplag/NormalizationTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,51 @@
import java.util.stream.Collectors;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import de.jplag.exceptions.ExitException;
import de.jplag.options.JPlagOptions;

class NormalizationTest extends TestBase {
private final Map<String, List<TokenType>> tokenStringMap;
private final List<TokenType> originalTokenString;
private Map<String, List<TokenType>> tokenStringMap;
private List<TokenType> originalTokenString;
private SubmissionSet submissionSet;

NormalizationTest() throws ExitException {
@BeforeEach
void setUp() throws ExitException {
JPlagOptions options = getDefaultOptions("normalization");
SubmissionSetBuilder builder = new SubmissionSetBuilder(options);
SubmissionSet submissionSet = builder.buildSubmissionSet();
submissionSet.normalizeSubmissions();
submissionSet = builder.buildSubmissionSet();
}

/**
 * Normalizes the submission set and (re-)initializes the token string fields used by the assertions.
 * @param sorting is passed on to the submission set normalization.
 */
private void normalizeSubmissions(boolean sorting) {
    submissionSet.normalizeSubmissions(sorting);
    // Map each submission name to the sequence of token types of its (now normalized) token list.
    tokenStringMap = submissionSet.getSubmissions().stream() //
            .collect(Collectors.toMap( //
                    Submission::getName, //
                    submission -> submission.getTokenList().stream().map(Token::getType).toList()));
    // Squares.java serves as the reference all variants are compared against.
    originalTokenString = tokenStringMap.get("Squares.java");
}

@Test
void testInsertionNormalization() {
@ParameterizedTest
@ValueSource(booleans = {true, false})
void testInsertionNormalization(boolean sorting) {
    normalizeSubmissions(sorting);
    // The variant with inserted code must match the original with and without sorting.
    List<TokenType> insertedTokenString = tokenStringMap.get("SquaresInserted.java");
    Assertions.assertIterableEquals(originalTokenString, insertedTokenString);
}

@Test
void testReorderingNormalization() {
Assertions.assertIterableEquals(originalTokenString, tokenStringMap.get("SquaresReordered.java"));
@ParameterizedTest
@ValueSource(booleans = {true, false})
void testReorderingNormalization(boolean sorting) {
    normalizeSubmissions(sorting);
    // The reordered variant is expected to equal the original exactly when sorting is enabled.
    List<TokenType> reorderedTokenString = tokenStringMap.get("SquaresReordered.java");
    boolean matchesOriginal = originalTokenString.equals(reorderedTokenString);
    Assertions.assertEquals(sorting, matchesOriginal);
}

@Test
void testInsertionReorderingNormalization() {
    // Sorting is enabled here, so both insertion and reordering are expected to be normalized away.
    normalizeSubmissions(true);
    List<TokenType> insertedReorderedTokenString = tokenStringMap.get("SquaresInsertedReordered.java");
    Assertions.assertIterableEquals(originalTokenString, insertedReorderedTokenString);
}
}