Skip to content

Commit

Permalink
[SYSTEMDS-3652] Adding support for removing duplicate rows to the uni…
Browse files Browse the repository at this point in the history
…que() builtin function (#1949)

This patch adds support for removing duplicate rows to the unique() builtin function. Currently, only CP is supported- support for SPARK will be added in a subsequent patch. Furthermore, we we will explore possible optimizations of the naive implementation using bitmaps in a subsequent patch.
  • Loading branch information
BACtaki authored Dec 5, 2023
1 parent 8355392 commit 5e132c0
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@
import org.apache.commons.lang3.NotImplementedException;
import org.apache.sysds.common.Types;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;

public class LibMatrixSketch {

public static MatrixBlock getUniqueValues(MatrixBlock blkIn, Types.Direction dir) {
//similar to R's unique, this operation takes a matrix and computes the
//unique values (or rows in case of multiple column inputs)
// similar to R's unique, this operation takes a matrix and computes the unique values
// (or rows in case of multiple column inputs)

int rlen = blkIn.getNumRows();
int clen = blkIn.getNumColumns();
Expand Down Expand Up @@ -57,6 +58,63 @@ public static MatrixBlock getUniqueValues(MatrixBlock blkIn, Types.Direction dir
break;

case Row:
ArrayList<double[]> retainedRows = new ArrayList<>();

for (int i=0; i<rlen; ++i) {

// BitSet will not work because we need 2 pieces of info:
// 1. the index and
// 2. the value
// A BitSet gives us only whether there is a value at a particular index, but not what that
// specific value is.

double[] currentRow = new double[clen];
for (int j=0; j<clen; ++j) {
double rawValue = blkIn.getValue(i, j);
currentRow[j] = rawValue;
}

// no need to check for duplicates for the first row
if (i == 0) {
retainedRows.add(currentRow);
continue;
}

// ensure we are not adding duplicate rows to retainedRows array
int uniqueRowCount = 0;
for (int m=0; m<retainedRows.size(); ++m) {

double[] prevRow = retainedRows.get(m);

int n = 0;
while (n < clen) {
if (prevRow[n] != currentRow[n]) {
break;
}
n++;
}

// column check terminates early only if there is a column-level mismatch, ie rows are different
if (n != clen) {
uniqueRowCount++;
}
}

// add current row to retainedRows iff it is unique from all prev retained rows
if (uniqueRowCount == retainedRows.size()) {
retainedRows.add(currentRow);
}
}

blkOut = new MatrixBlock(retainedRows.size(), blkIn.getNumColumns(), false);
for (int i=0; i<retainedRows.size(); ++i) {
for (int j=0; j<blkIn.getNumColumns(); ++j) {
blkOut.quickSetValue(i, j, retainedRows.get(i)[j]);
}
}

break;

case Col:
throw new NotImplementedException("Unique Row/Col has not been implemented yet");

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.test.functions.unique;

import org.apache.sysds.common.Types;
import org.junit.Test;

public class UniqueRow extends UniqueBase {
private final static String TEST_NAME = "uniqueRow";
private final static String TEST_DIR = "functions/unique/";
private static final String TEST_CLASS_DIR = TEST_DIR + UniqueRow.class.getSimpleName() + "/";


@Override
protected String getTestName() {
return TEST_NAME;
}

@Override
protected String getTestDir() {
return TEST_DIR;
}

@Override
protected String getTestClassDir() {
return TEST_CLASS_DIR;
}

@Test
public void testBaseCaseCP() {
double[][] inputMatrix = {{0}};
double[][] expectedMatrix = {{0}};
uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
}

@Test
public void testSkinnyCP() {
double[][] inputMatrix = {{1},{1},{6},{9},{4},{2},{0},{9},{0},{0},{4},{4}};
double[][] expectedMatrix = {{1},{6},{9},{4},{2},{0}};
uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
}

@Test
public void testSquareCP() {
double[][] inputMatrix = {{1, 2, 3}, {4, 5, 6}, {1, 2, 3}};
double[][] expectedMatrix = {{1, 2, 3},{4, 5, 6}};
uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
}

@Test
public void testWideCP() {
double[][] inputMatrix = {{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}, {1, 2, 3, 4, 5, 6}};
double[][] expectedMatrix = {{1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}};
uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
}

@Test
public void testNoDuplicatesCP() {
double[][] inputMatrix = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
double[][] expectedMatrix = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
uniqueTest(inputMatrix, expectedMatrix, Types.ExecType.CP, 0.0);
}
}
24 changes: 24 additions & 0 deletions src/test/scripts/functions/unique/uniqueRow.dml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

input = read($1);
res = unique(input, dir="r");
write(res, $2, format="text");

0 comments on commit 5e132c0

Please sign in to comment.