Skip to content

Commit

Permalink
[SYSTEMDS-3547] Additional transpose in-place kernel (Brenner)
Browse files Browse the repository at this point in the history
DIA WiSe 24/25 project
Closes #2199.
  • Loading branch information
jessicapriebe authored and mboehm7 committed Feb 1, 2025
1 parent c66b39f commit 0b5fae9
Show file tree
Hide file tree
Showing 2 changed files with 360 additions and 13 deletions.
263 changes: 250 additions & 13 deletions src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixReorg.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ public class LibMatrixReorg {

//use csr instead of mcsr sparse block for rexpand columns / diag v2m
public static final boolean SPARSE_OUTPUTS_IN_CSR = true;

//use legacy in-place transpose dense instead of brenner in-place transpose dense
public static final boolean TRANSPOSE_IN_PLACE_DENSE_LEGACY = true;

private enum ReorgType {
TRANSPOSE,
Expand Down Expand Up @@ -1789,19 +1792,23 @@ else if(cols == rows){
transposeInPlaceTrivial(in.getDenseBlockValues(), cols, k);
}
else {
if(cols<rows){
// important to set dims after
c2r(in, k);
values.setDims(new int[]{rows,cols});
in.setNumColumns(cols);
in.setNumRows(rows);
}
else{
// important to set dims before
values.setDims(new int[]{rows,cols});
in.setNumColumns(cols);
in.setNumRows(rows);
r2c(in, k);
if(TRANSPOSE_IN_PLACE_DENSE_LEGACY) {
if(cols < rows) {
// important to set dims after
c2r(in, k);
values.setDims(new int[] {rows, cols});
in.setNumColumns(cols);
in.setNumRows(rows);
}
else {
// important to set dims before
values.setDims(new int[] {rows, cols});
in.setNumColumns(cols);
in.setNumRows(rows);
r2c(in, k);
}
} else {
transposeInPlaceDenseBrenner(in, k);
}
}

Expand Down Expand Up @@ -4227,4 +4234,234 @@ public Object call() {
return null;
}
}

/**
* Transposes a dense matrix in-place using following cycles based on Brenner's method. This
* method shifts cycles with a focus on less storage by using cycle leaders based on prime factorization. The used
* storage is in O(n+m). Quadratic matrices should be handled outside this method (using the trivial method) for a
* speedup. This method is based on: Algorithm 467, Brenner, https://dl.acm.org/doi/pdf/10.1145/355611.362542.
*
* @param in The input matrix to be transposed.
* @param k The number of threads.
*/
public static void transposeInPlaceDenseBrenner(MatrixBlock in, int k) {

DenseBlock denseBlock = in.getDenseBlock();
double[] matrix = in.getDenseBlockValues();

final int rows = in.getNumRows();
final int cols = in.getNumColumns();

// Brenner: rows + cols / 2 is sufficient for most cases.
int workSize = rows + cols;
int maxIndex = rows * cols - 1;

// prime factorization of the maximum index to identify cycle structures
// Brenner: length 8 is sufficient up to maxIndex 2*3*5*...*19 = 9,767,520.
int[] primes = new int[8];
int[] exponents = new int[8];
int[] powers = new int[8];
int numPrimes = primeFactorization(maxIndex, primes, exponents, powers);

int[] iExponents = new int[numPrimes];
int div = 1;

div:
while(div < maxIndex / 2) {

// number of indices divisible by div and no other divisor of maxIndex
int count = eulerTotient(primes, exponents, iExponents, numPrimes, maxIndex / div);
// all false
boolean[] moved = new boolean[workSize];
// starting point cycle
int start = div;

count:
do {
// companion of start
int comp = maxIndex - start;

if(start == div) {
// shift cycles
count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
start += div;
}
else if(start < workSize && moved[start]) {
// already moved
start += div;
}
else {
// handle other cycle starts
int cycleLeader = start / div;
for(int ip = 0; ip < numPrimes; ip++) {
if(iExponents[ip] != exponents[ip] && cycleLeader % primes[ip] == 0) {
start += div;
continue count;
}
}

if(start < workSize) {
count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
start += div;
continue;
}

int test = start;
do {
test = prevIndexCycle(test, rows, cols);
if(test < start || test > comp) {
start += div;
continue count;
}
}
while(test > start && test < comp);

count = simultaneousCycleShift(matrix, moved, rows, maxIndex, count, workSize, start, comp);
start += div;
}
}
while(count > 0);

// update cycle divisor for the next set of cycles based on prime factors
for(int ip = 0; ip < numPrimes; ip++) {
if(iExponents[ip] != exponents[ip]) {
iExponents[ip]++;
div *= primes[ip];
continue div;
}
iExponents[ip] = 0;
div /= powers[ip];
}
}

denseBlock.setDims(new int[] {cols, rows});
in.setNumColumns(rows);
in.setNumRows(cols);
}

/**
* Performs a simultaneous cycle shift for a cycle and its companion cycle. This method ensures that distinct cycles
* or self-dual cycles are handled correctly. This method is based on: Algorithm 2, Karlsson,
* https://webapps.cs.umu.se/uminf/reports/2009/011/part1.pdf and Algorithm 467, Brenner,
* https://dl.acm.org/doi/pdf/10.1145/355611.362542.
*
* @param matrix The matrix whose elements are being shifted.
* @param moved Boolean array tracking whether an element has already been moved.
* @param rows The number of rows in the matrix.
* @param maxIndex The maximum valid index in the matrix.
* @param count The number of elements left to process.
* @param workSize The length of moved.
* @param start The starting index for the cycle shift.
* @param comp The corresponding companion index.
* @return The updated count of elements remaining to shift.
*/
private static int simultaneousCycleShift(double[] matrix, boolean[] moved, int rows, int maxIndex, int count,
int workSize, int start, int comp) {

int orig = start;
double val = matrix[orig];
double cval = matrix[comp];

while(orig >= 0) {
// decrease the remaining shift count by orig and comp
count -= 2;
orig = simultaneousCycleShiftStep(matrix, moved, rows, maxIndex, workSize, start, orig, val, cval);
}
return count;
}

private static int simultaneousCycleShiftStep(double[] matrix, boolean[] moved, int rows, int maxIndex,
int workSize, int start, int orig, double val, double cval) {

int comp = maxIndex - orig;
int prevOrig = prevIndexCycle(orig, rows, (maxIndex + 1) / rows);
int prevComp = maxIndex - prevOrig;

if(orig < workSize)
moved[orig] = true;
if(comp < workSize)
moved[comp] = true;

if(prevOrig == start) {
// cycle and comp are distinct
matrix[orig] = val;
matrix[comp] = cval;
return -1;
}
else if(prevComp == start) {
// cycle is self dual
matrix[orig] = cval;
matrix[comp] = val;
return -1;
}

// shift the values to their next positions
matrix[orig] = matrix[prevOrig];
matrix[comp] = matrix[prevComp];
// update
return prevOrig;
}

private static int prevIndexCycle(int index, int rows, int cols) {
int lastIndex = rows * cols - 1;
if(index == lastIndex)
return lastIndex;
long temp = (long) index * cols;
return (int) (temp % lastIndex);
}

/**
* Performs prime factorization of a given number n. The method calculates the prime factors of n, their exponents,
* powers and stores the results in the provided arrays.
*
* @param n The number to be factorized.
* @param primes Array to store the unique prime factors of n.
* @param exponents Array to store the exponents of the respective prime factors.
* @param powers Array to store the powers of the respective prime factors.
* @return The number of unique prime factors.
*/
private static int primeFactorization(int n, int[] primes, int[] exponents, int[] powers) {
int pIdx = -1;
int currDiv = 0;
int rest = n;
int div = 2;

while(rest > 1) {
int quotient = rest / div;
if(rest - div * quotient == 0) {
if(div == currDiv) {
// current divisor is the same as the last one
powers[pIdx] *= div;
exponents[pIdx]++;
}
else {
// new prime factor found
pIdx++;
if(pIdx >= primes.length)
throw new RuntimeException("Not enough space, need to expand input arrays.");

primes[pIdx] = div;
powers[pIdx] = div;
currDiv = div;
exponents[pIdx] = 1;
}
rest = quotient;
}
else {
// only odd divs
div = (div == 2) ? 3 : div + 2;
}
}
return pIdx + 1;
}

private static int eulerTotient(int[] primes, int[] exponents, int[] iExponents, int numPrimes, int count) {
for(int ip = 0; ip < numPrimes; ip++) {
if(iExponents[ip] == exponents[ip]) {
continue;
}
count = (count / primes[ip]) * (primes[ip] - 1);
}
return count;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.test.component.matrix.libMatrixReorg;

import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
import org.apache.sysds.runtime.matrix.data.MatrixBlock;
import org.apache.sysds.test.TestUtils;
import org.junit.Test;

import static org.junit.Assert.assertThrows;
import static org.junit.Assert.assertTrue;

public class TransposeInPlaceBrennerTest {

@Test
public void transposeInPlaceDenseBrennerOnePrime() {
// 3*4-1 = 11
testTransposeInPlaceDense(3, 4, 1);
}

@Test
public void transposeInPlaceDenseBrennerTwoPrimes() {
// 4*9-1 = 5*7
testTransposeInPlaceDense(4, 9, 0.96);
}

@Test
public void transposeInPlaceDenseBrennerThreePrimes() {
// 2*53-1 = 3*5*7
testTransposeInPlaceDense(2, 53, 0.52);
}

@Test
public void transposeInPlaceDenseBrennerThreePrimesOneExpo() {
// 1151*2999-1 = (2**3)*3*143827
testTransposeInPlaceDense(1151, 2999, 0.82);
}

@Test
public void transposeInPlaceDenseBrennerThreePrimesAllExpos() {
// 9*10889-1 = (2**4)*(5**3)*(7**2)
testTransposeInPlaceDense(9, 10889, 0.74);
}

@Test
public void transposeInPlaceDenseBrennerFourPrimesOneExpo() {
// 53*4421-1 = (2**3)*3*13*751
testTransposeInPlaceDense(53, 4421, 0.75);
}

@Test
public void transposeInPlaceDenseBrennerFivePrimes() {
// 3*3337-1 = 2*5*7*11*13
testTransposeInPlaceDense(3, 3337, 0.68);
}

@Test
public void transposeInPlaceDenseBrennerSixPrimesOneExpo() {
// 53*7177-1 = (2**2)*5*7*11*13*19
testTransposeInPlaceDense(53, 7177, 0.78);
}

@Test
public void transposeInPlaceDenseBrennerSevenPrimesThreeExpos() {
// 2087*17123-1 = (2**2)*3*(5**2)*(7**2)*11*13*17
testTransposeInPlaceDense(2087, 17123, 0.79);
}

@Test
public void transposeInPlaceDenseBrennerEightPrimes() {
// 347*27953-1 = 2*3*5*7*11*13*17*19
testTransposeInPlaceDense(347, 27953, 0.86);
}

@Test
public void transposeInPlaceDenseBrennerNinePrimes() {
// 317*703763-1 = 2*3*5*7*11*13*17*19*23
MatrixBlock X = MatrixBlock.randOperations(317, 703763, 0.52);

Exception exception = assertThrows(RuntimeException.class,
() -> LibMatrixReorg.transposeInPlaceDenseBrenner(X, 1));
assertTrue(exception.getMessage().contains("Not enough space, need to expand input arrays."));
}

private void testTransposeInPlaceDense(int rows, int cols, double sparsity) {
MatrixBlock X = MatrixBlock.randOperations(rows, cols, sparsity);
MatrixBlock tX = LibMatrixReorg.transpose(X);

LibMatrixReorg.transposeInPlaceDenseBrenner(X, 1);

TestUtils.compareMatrices(X, tX, 0);
}
}

0 comments on commit 0b5fae9

Please sign in to comment.