Skip to content

Commit 3a6e285

Browse files
author
Nabil Miri
committed
Add Hamming Distance knn similarity metric for long property
+ tests
1 parent 255e450 commit 3a6e285

11 files changed

+219
-17
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.similarity.knn.metrics;
21+
22+
/**
 * Similarity metric built on the Hamming distance
 * (https://en.wikipedia.org/wiki/Hamming_distance) between the bit
 * patterns of two {@code long} values. The raw distance is rescaled
 * linearly into the 0..1 range and inverted, so that a larger result
 * means the two values are more alike.
 */
public final class HammingDistance {
    private HammingDistance() {}

    /**
     * Computes the similarity of two {@code long} values from the number
     * of bit positions in which they differ.
     *
     * @param left  the first value
     * @param right the second value
     * @return 1.0 when no bit differs, 0.0 when all 64 bits differ
     */
    public static double longMetric(long left, long right) {
        // XOR leaves a 1 in exactly those positions where the operands disagree.
        long differingBits = Long.bitCount(left ^ right);
        return normalizeBitCount(differingBits);
    }

    /**
     * Applies unity-based (min-max) normalization,
     * y = (x - min) / (max - min), to the bit count. A bit count is never
     * negative, so min is 0, and a Java long has 64 bits, so max is 64.
     * See https://stats.stackexchange.com/a/70807 for an example.
     *
     * The normalized distance is subtracted from 1.0 so that 1.0 stands
     * for "most similar" and 0.0 for "least similar".
     */
    private static double normalizeBitCount(long bitCount) {
        double normalizedDistance = bitCount / (double) Long.SIZE;
        return 1.0 - normalizedDistance;
    }
}

algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/LongPropertySimilarityComputer.java

+4-6
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,20 @@
2424

2525
final class LongPropertySimilarityComputer implements SimilarityComputer {
2626
private final NodePropertyValues nodePropertyValues;
27+
private final LongPropertySimilarityMetric metric;
2728

28-
LongPropertySimilarityComputer(NodePropertyValues nodePropertyValues) {
29+
LongPropertySimilarityComputer(NodePropertyValues nodePropertyValues, LongPropertySimilarityMetric metric) {
2930
if (nodePropertyValues.valueType() != ValueType.LONG) {
3031
throw new IllegalArgumentException("The property is not of type LONG");
3132
}
3233
this.nodePropertyValues = nodePropertyValues;
34+
this.metric = metric;
3335
}
3436

3537
@Override
3638
public double similarity(long firstNodeId, long secondNodeId) {
3739
var left = nodePropertyValues.longValue(firstNodeId);
3840
var right = nodePropertyValues.longValue(secondNodeId);
39-
var abs = Math.abs(left - right);
40-
if (abs == Long.MIN_VALUE) {
41-
abs = Long.MAX_VALUE;
42-
}
43-
return 1.0 / (1.0 + abs);
41+
return metric.compute(left, right);
4442
}
4543
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.similarity.knn.metrics;
21+
/**
 * Strategy for turning two {@code long} property values into a
 * similarity score. The interface declares a single abstract method so
 * that implementations can be supplied as lambdas or method references.
 */
@FunctionalInterface
interface LongPropertySimilarityMetric {
    /**
     * Computes the similarity of two property values.
     *
     * @param left  the property value of the first node
     * @param right the property value of the second node
     * @return the similarity score of the two values
     */
    double compute(long left, long right);
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.similarity.knn.metrics;
21+
22+
/**
 * Similarity metric for {@code long} values based on their absolute
 * difference: {@code 1 / (1 + |left - right|)}. Equal values score 1.0
 * and the score shrinks towards 0 as the values move apart.
 */
public final class NormalizedAbsoluteDifference {
    private NormalizedAbsoluteDifference() {}

    /**
     * Computes the similarity of two {@code long} values.
     *
     * @param left  the first value
     * @param right the second value
     * @return {@code 1.0 / (1.0 + d)}, where {@code d} is the absolute
     *         difference of the operands, capped at {@link Long#MAX_VALUE}
     *         when the true difference does not fit into a {@code long}
     */
    public static double longMetric(long left, long right) {
        long diff = left - right;

        // The subtraction wrapped around iff the operands have different
        // signs and the result's sign differs from the sign of `left`.
        // Without this check, e.g. MAX_VALUE - MIN_VALUE wraps to -1 and
        // two maximally distant values would be reported as close.
        boolean overflowed = ((left ^ right) & (left ^ diff)) < 0;

        // Math.abs(Long.MIN_VALUE) is still Long.MIN_VALUE, so a difference
        // of exactly MIN_VALUE has to be capped as well.
        long abs = (overflowed || diff == Long.MIN_VALUE)
            ? Long.MAX_VALUE
            : Math.abs(diff);

        return 1.0 / (1.0 + abs);
    }
}

algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputer.java

+14-3
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,11 @@ static SimilarityComputer ofProperty(
7373
) {
7474
switch (properties.valueType()) {
7575
case LONG:
76-
return ofLongProperty(properties);
76+
return ofLongProperty(
77+
name,
78+
properties,
79+
defaultSimilarityMetric
80+
);
7781
case DOUBLE:
7882
return ofDoubleProperty(properties);
7983
case DOUBLE_ARRAY:
@@ -107,8 +111,15 @@ static SimilarityComputer ofDoubleProperty(NodePropertyValues nodePropertyValues
107111
return new DoublePropertySimilarityComputer(nodePropertyValues);
108112
}
109113

110-
static SimilarityComputer ofLongProperty(NodePropertyValues nodePropertyValues) {
111-
return new LongPropertySimilarityComputer(nodePropertyValues);
114+
static SimilarityComputer ofLongProperty(String name, NodePropertyValues properties, SimilarityMetric metric) {
115+
switch (metric) {
116+
case HAMMING_DISTANCE:
117+
return new LongPropertySimilarityComputer(properties, HammingDistance::longMetric);
118+
case NORMALIZED_ABSOLUTE_DIFFERENCE:
119+
return new LongPropertySimilarityComputer(properties, NormalizedAbsoluteDifference::longMetric);
120+
default:
121+
throw unsupportedSimilarityMetric(name, properties.valueType(), metric);
122+
}
112123
}
113124

114125
static SimilarityComputer ofFloatArrayProperty(String name, NodePropertyValues properties, SimilarityMetric metric) {

algo/src/main/java/org/neo4j/gds/similarity/knn/metrics/SimilarityMetric.java

+4-2
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,9 @@
2525
import static org.neo4j.gds.utils.StringFormatting.toUpperCaseWithLocale;
2626

2727
public enum SimilarityMetric {
28-
JACCARD, OVERLAP, COSINE, EUCLIDEAN, PEARSON, LONG_PROPERTY_METRIC, DOUBLE_PROPERTY_METRIC, DEFAULT;
28+
JACCARD, OVERLAP, COSINE, EUCLIDEAN, PEARSON,
29+
NORMALIZED_ABSOLUTE_DIFFERENCE, DOUBLE_PROPERTY_METRIC,
30+
HAMMING_DISTANCE, DEFAULT;
2931

3032
public static SimilarityMetric parse(String value) {
3133
return SimilarityMetric.valueOf(toUpperCaseWithLocale(value));
@@ -34,7 +36,7 @@ public static SimilarityMetric parse(String value) {
3436
public static SimilarityMetric defaultMetricForType(ValueType valueType) {
3537
switch (valueType) {
3638
case LONG:
37-
return LONG_PROPERTY_METRIC;
39+
return NORMALIZED_ABSOLUTE_DIFFERENCE;
3840
case DOUBLE:
3941
return DOUBLE_PROPERTY_METRIC;
4042
case DOUBLE_ARRAY:

algo/src/test/java/org/neo4j/gds/similarity/filteredknn/FilteredGenerateRandomNeighborsTest.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ public long size() {
6868
idMap,
6969
"myProperty",
7070
nodeProperties,
71-
SimilarityMetric.LONG_PROPERTY_METRIC
71+
SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE
7272
);
7373

7474
var random = new SplittableRandom();

algo/src/test/java/org/neo4j/gds/similarity/knn/GenerateRandomNeighborsTest.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ public long size() {
6767
idMap,
6868
"myProperty",
6969
nodeProperties,
70-
SimilarityMetric.LONG_PROPERTY_METRIC
70+
SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE
7171
);
7272

7373
var random = new SplittableRandom();
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.similarity.knn.metrics;
21+
22+
import org.junit.jupiter.api.Test;
23+
24+
import static org.junit.jupiter.api.Assertions.assertEquals;
25+
26+
class HammingDistanceTest {
27+
@Test
28+
void shouldReturnFullCorrelationWhenArgsAreIdentical() {
29+
double dist = HammingDistance.longMetric(12345L, 12345L);
30+
31+
assertEquals(1.0, dist);
32+
}
33+
34+
@Test
35+
void shouldReturnCorrectCorrelation() {
36+
double dist = HammingDistance.longMetric(12345L, 54321L);
37+
38+
assertEquals(0.921875, dist);
39+
}
40+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
* Copyright (c) "Neo4j"
3+
* Neo4j Sweden AB [http://neo4j.com]
4+
*
5+
* This file is part of Neo4j.
6+
*
7+
* Neo4j is free software: you can redistribute it and/or modify
8+
* it under the terms of the GNU General Public License as published by
9+
* the Free Software Foundation, either version 3 of the License, or
10+
* (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful,
13+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
* GNU General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
*/
20+
package org.neo4j.gds.similarity.knn.metrics;
21+
22+
import org.junit.jupiter.api.Test;
23+
24+
import static org.junit.jupiter.api.Assertions.assertEquals;
25+
26+
class NormalizedAbsoluteDifferenceTest {
27+
@Test
28+
void shouldComputeNormalizedAbsoluteDifference() {
29+
double diff = NormalizedAbsoluteDifference.longMetric(1L, 2L);
30+
31+
assertEquals(1.0, diff);
32+
}
33+
}

algo/src/test/java/org/neo4j/gds/similarity/knn/metrics/SimilarityComputerTest.java

+15-4
Original file line numberDiff line numberDiff line change
@@ -66,16 +66,22 @@ void doublePropertySimilarityReturnsValuesBetween0And1(@ForAll @From("differentV
6666
}
6767

6868
@Property
69-
void longPropertySimilarityReturns1ForEqualValues(@ForAll @Positive long id) {
69+
void longPropertySimilarityReturns1ForEqualValues(
70+
@ForAll @Positive long id,
71+
@ForAll @From("longMetrics") SimilarityMetric similarityMetric
72+
) {
7073
NodePropertyValues props = new LongTestPropertyValues(nodeId -> nodeId);
71-
var sim = SimilarityComputer.ofLongProperty(props);
74+
var sim = SimilarityComputer.ofLongProperty("", props, similarityMetric);
7275
assertThat(sim.similarity(id, id)).isEqualTo(1.0);
7376
}
7477

7578
@Property
76-
void longPropertySimilarityReturnsValuesBetween0And1(@ForAll @From("differentValues") LongLongPair ids) {
79+
void longPropertySimilarityReturnsValuesBetween0And1(
80+
@ForAll @From("differentValues") LongLongPair ids,
81+
@ForAll @From("longMetrics") SimilarityMetric similarityMetric
82+
) {
7783
NodePropertyValues props = new LongTestPropertyValues(nodeId -> nodeId);
78-
var sim = SimilarityComputer.ofLongProperty(props);
84+
var sim = SimilarityComputer.ofLongProperty("", props, similarityMetric);
7985
assertThat(sim.similarity(ids.getOne(), ids.getTwo())).isStrictlyBetween(0.0, 1.0);
8086
}
8187

@@ -305,6 +311,11 @@ final Arbitrary<LongLongPair> differentValues() {
305311
.map(n2 -> PrimitiveTuples.pair((long) n1, (long) n2)));
306312
}
307313

314+
@Provide("longMetrics")
315+
final Arbitrary<SimilarityMetric> longMetrics() {
316+
return Arbitraries.of(SimilarityMetric.NORMALIZED_ABSOLUTE_DIFFERENCE, SimilarityMetric.HAMMING_DISTANCE);
317+
}
318+
308319
@Provide("longArrayMetrics")
309320
final Arbitrary<SimilarityMetric> longArrayMetrics() {
310321
return Arbitraries.of(SimilarityMetric.JACCARD, SimilarityMetric.OVERLAP);

0 commit comments

Comments
 (0)