From b193b7012ca0460548502c576c9f5da4b6d6a242 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sun, 24 Aug 2025 14:49:22 -0700 Subject: [PATCH 01/26] Set pom to min java version of 22. Set minimum Maven version to 3.9.11. Remove obsolete ds-memory reference. --- pom.xml | 5 ++--- src/main/java/org/apache/datasketches/hll/HllSketch.java | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pom.xml b/pom.xml index 972d21449..75247f638 100644 --- a/pom.xml +++ b/pom.xml @@ -83,7 +83,6 @@ under the License. - 6.1.0-SNAPSHOT 7.11.0 @@ -94,7 +93,7 @@ under the License. check_cpp_historical_files - 3.9.10 + 3.9.11 24 -Xmx4g -Duser.language=en -Duser.country=US -Dfile.encoding=UTF-8 ${java.version} @@ -184,7 +183,7 @@ under the License. - [24,) + [22,) [${maven.version},4.0.0) diff --git a/src/main/java/org/apache/datasketches/hll/HllSketch.java b/src/main/java/org/apache/datasketches/hll/HllSketch.java index 9709ad69d..35d782a27 100644 --- a/src/main/java/org/apache/datasketches/hll/HllSketch.java +++ b/src/main/java/org/apache/datasketches/hll/HllSketch.java @@ -238,7 +238,7 @@ static final HllSketch heapify(final MemorySegment srcSeg, final boolean checkRe *

The given dstSeg is checked for the required capacity as determined by * {@link #getMaxUpdatableSerializationBytes(int, TgtHllType)}. * @param srcWseg an writable image of a valid source sketch with data. - * @return an HllSketch where the sketch data is in the given dstSeg. + * @return an HllSketch where the sketch data is in the given srcWseg. */ public static final HllSketch writableWrap(final MemorySegment srcWseg) { if (srcWseg.isReadOnly()) { return wrap(srcWseg); } @@ -251,7 +251,7 @@ static final HllSketch writableWrap( final MemorySegment srcWseg, final boolean checkBounds(0, 8, srcWseg.byteSize()); //need min 8 bytes if (extractCompactFlag(srcWseg)) { throw new SketchesArgumentException( - "Cannot perform a writableWrap of a writable sketch image that is in compact form. " + "Cannot perform a writableWrap of a sketch image that is in compact form. " + "Compact sketches are by definition immutable."); } final int lgConfigK = extractLgK(srcWseg); From 3388762d927e1f28a8fd814d9fb02db93352aa36 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 25 Aug 2025 14:13:26 -0700 Subject: [PATCH 02/26] Add tests, minor fixes to improve testing. Fixed Javadocs. 
--- .../datasketches/kll/KllDoublesSketch.java | 6 +- .../datasketches/kll/KllFloatsSketch.java | 6 +- .../datasketches/kll/KllLongsSketch.java | 8 +-- .../datasketches/theta/CompactSketch.java | 14 ++--- .../datasketches/theta/CompactSketchTest.java | 20 ++++++- .../theta/ThetaSketchCrossLanguageTest.java | 60 ++++++++++++++++--- 6 files changed, 89 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index 45a6ae8bb..0e4b48794 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -144,9 +144,7 @@ public static KllDoublesSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. */ public static KllDoublesSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, DOUBLES_SKETCH); - return new KllDirectDoublesSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -386,6 +384,8 @@ else if (weight < levelsArr[0]) { /** * Vector update. Updates this sketch with the given array (vector) of items, starting at the items * offset for a length number of items. This is not supported for direct sketches. + *

Note: a single occurrence of a NaN in the array will force this method to use the conventional update path + * rather than the fast update path.

* @param items the vector of items * @param offset the starting index of the items[] array * @param length the number of items diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index 35c8711ed..d41abb891 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -144,9 +144,7 @@ public static KllFloatsSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. */ public static KllFloatsSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, FLOATS_SKETCH); - return new KllDirectFloatsSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -386,6 +384,8 @@ else if (weight < levelsArr[0]) { /** * Vector update. Updates this sketch with the given array (vector) of items, starting at the items * offset for a length number of items. This is not supported for direct sketches. + *

Note: a single occurrence of a NaN in the array will force this method to use the conventional update path + * rather than the fast update path.

* @param items the vector of items * @param offset the starting index of the items[] array * @param length the number of items diff --git a/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java b/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java index c5ada70b4..6acf93799 100644 --- a/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllLongsSketch.java @@ -144,9 +144,7 @@ public static KllLongsSketch heapify(final MemorySegment srcSeg) { * @return an instance of this sketch that wraps the given MemorySegment. */ public static KllLongsSketch wrap(final MemorySegment srcSeg) { - Objects.requireNonNull(srcSeg, "Parameter 'srcSeg' must not be null"); - final KllMemorySegmentValidate segVal = new KllMemorySegmentValidate(srcSeg, LONGS_SKETCH); - return new KllDirectLongsSketch(srcSeg, segVal, null); + return wrap(srcSeg, null); } /** @@ -364,7 +362,7 @@ final void updateMinMax(final long item) { /** * Weighted update. Updates this sketch with the given item the number of times specified by the given integer weight. - * @param item the item to be repeated. NaNs are ignored. + * @param item the item to be repeated. * @param weight the number of times the update of item is to be repeated. It must be ≥ one. 
*/ public void update(final long item, final long weight) { @@ -409,6 +407,8 @@ public void update(final long[] items, final int offset, final int length) { + + */ private void updateLong(final long[] srcItems, final int srcOffset, final int length) { if (isEmpty()) { diff --git a/src/main/java/org/apache/datasketches/theta/CompactSketch.java b/src/main/java/org/apache/datasketches/theta/CompactSketch.java index edd55165c..1dcec1a97 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/CompactSketch.java @@ -228,7 +228,7 @@ else if (serVer == 2) { } /** - * Wrap takes the sketch image in the given MemorySegment and refers to it directly. + * Wrap takes the sketch image in the given byte array and refers to it directly. * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * @@ -241,20 +241,20 @@ else if (serVer == 2) { * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

This method checks if the DEFAULT_UPDATE_SEED was used to create the source MemorySegment image. + *

This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image. * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, * so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.

* * @param bytes a byte array image of a Sketch that was created using the DEFAULT_UPDATE_SEED. * - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes) { return wrap(bytes, Util.DEFAULT_UPDATE_SEED, false); } /** - * Wrap takes the sketch image in the given MemorySegment and refers to it directly. + * Wrap takes the sketch image in the given byte array and refers to it directly. * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * @@ -267,14 +267,14 @@ public static CompactSketch wrap(final byte[] bytes) { * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

This method checks if the given expectedSeed was used to create the source MemorySegment image. + *

This method checks if the given expectedSeed was used to create the source byte array image. * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.

* * @param bytes a byte array image of a Sketch that was created using the given expectedSeed. - * @param expectedSeed the seed used to validate the given MemorySegment image. + * @param expectedSeed the seed used to validate the given byte array image. * See Update Hash Seed. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes, final long expectedSeed) { return wrap(bytes, expectedSeed, true); diff --git a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java index 6d9c173a0..8ef889be4 100644 --- a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java @@ -582,7 +582,7 @@ public void serializeDeserializeHeapV4() { } @Test - public void serializeDeserializeDirectV4() { + public void serializeDeserializeDirectV4_segment() { final UpdateSketch sk = Sketches.updateSketchBuilder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); @@ -598,6 +598,24 @@ public void serializeDeserializeDirectV4() { } } + @Test + public void serializeDeserializeDirectV4_bytes() { + final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + for (int i = 0; i < 10000; i++) { + sk.update(i); + } + final CompactSketch cs1 = sk.compact(true, MemorySegment.ofArray(new byte[sk.getCompactBytes()])); + final byte[] bytes = cs1.toByteArrayCompressed(); + final CompactSketch cs2 = CompactSketch.wrap(bytes); + assertEquals(cs1.getRetainedEntries(), cs2.getRetainedEntries()); + final HashIterator it1 = cs1.iterator(); + final HashIterator it2 = cs2.iterator(); + while (it1.next() && it2.next()) { + assertEquals(it2.get(), it2.get()); + } + } + + @Test public void serializeWrapBytesV3() { final UpdateSketch sk = Sketches.updateSketchBuilder().build(); diff --git 
a/src/test/java/org/apache/datasketches/theta/ThetaSketchCrossLanguageTest.java b/src/test/java/org/apache/datasketches/theta/ThetaSketchCrossLanguageTest.java index 7d69b3832..64449027a 100644 --- a/src/test/java/org/apache/datasketches/theta/ThetaSketchCrossLanguageTest.java +++ b/src/test/java/org/apache/datasketches/theta/ThetaSketchCrossLanguageTest.java @@ -45,7 +45,7 @@ public class ThetaSketchCrossLanguageTest { @Test(groups = {GENERATE_JAVA_FILES}) public void generateBinariesForCompatibilityTesting() throws IOException { final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000}; - for (int n: nArr) { + for (final int n: nArr) { final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < n; i++) { sk.update(i); @@ -57,7 +57,7 @@ public void generateBinariesForCompatibilityTesting() throws IOException { @Test(groups = {GENERATE_JAVA_FILES}) public void generateBinariesForCompatibilityTestingCompressed() throws IOException { final int[] nArr = {10, 100, 1000, 10_000, 100_000, 1_000_000}; - for (int n: nArr) { + for (final int n: nArr) { final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < n; i++) { sk.update(i); @@ -76,9 +76,9 @@ public void generateBinariesForCompatibilityTestingNonEmptyNoEntries() throws IO } @Test(groups = {CHECK_CPP_FILES}) - public void deserializeFromCpp() throws IOException { + public void deserializeFromCppSegment() throws IOException { final int[] nArr = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; - for (int n: nArr) { + for (final int n: nArr) { final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_n" + n + "_cpp.sk")); final CompactSketch sketch = CompactSketch.wrap(MemorySegment.ofArray(bytes)); assertTrue(n == 0 ? 
sketch.isEmpty() : !sketch.isEmpty()); @@ -95,9 +95,28 @@ public void deserializeFromCpp() throws IOException { } @Test(groups = {CHECK_CPP_FILES}) - public void deserializeFromCppCompressed() throws IOException { + public void deserializeFromCppBytes() throws IOException { + final int[] nArr = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (final int n: nArr) { + final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_n" + n + "_cpp.sk")); + final CompactSketch sketch = CompactSketch.wrap(bytes); + assertTrue(n == 0 ? sketch.isEmpty() : !sketch.isEmpty()); + assertEquals(sketch.getEstimate(), n, n * 0.03); + assertTrue(sketch.isOrdered()); + final HashIterator it = sketch.iterator(); + long previous = 0; + while (it.next()) { + assertTrue(it.get() < sketch.getThetaLong()); + assertTrue(it.get() > previous); + previous = it.get(); + } + } + } + + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppCompressedSegment() throws IOException { final int[] nArr = {10, 100, 1000, 10000, 100000, 1000000}; - for (int n: nArr) { + for (final int n: nArr) { final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_compressed_n" + n + "_cpp.sk")); final CompactSketch sketch = CompactSketch.wrap(MemorySegment.ofArray(bytes)); assertTrue(n == 0 ? sketch.isEmpty() : !sketch.isEmpty()); @@ -114,11 +133,38 @@ public void deserializeFromCppCompressed() throws IOException { } @Test(groups = {CHECK_CPP_FILES}) - public void deserializeFromCppNonEmptyNoEntries() throws IOException { + public void deserializeFromCppCompressedBytes() throws IOException { + final int[] nArr = {10, 100, 1000, 10000, 100000, 1000000}; + for (final int n: nArr) { + final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_compressed_n" + n + "_cpp.sk")); + final CompactSketch sketch = CompactSketch.wrap(bytes); + assertTrue(n == 0 ? 
sketch.isEmpty() : !sketch.isEmpty()); + assertEquals(sketch.getEstimate(), n, n * 0.03); + assertTrue(sketch.isOrdered()); + final HashIterator it = sketch.iterator(); + long previous = 0; + while (it.next()) { + assertTrue(it.get() < sketch.getThetaLong()); + assertTrue(it.get() > previous); + previous = it.get(); + } + } + } + + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppNonEmptyNoEntriesSegment() throws IOException { final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_non_empty_no_entries_cpp.sk")); final CompactSketch sketch = CompactSketch.wrap(MemorySegment.ofArray(bytes)); assertFalse(sketch.isEmpty()); assertEquals(sketch.getRetainedEntries(), 0); } + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppNonEmptyNoEntriesBytes() throws IOException { + final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_non_empty_no_entries_cpp.sk")); + final CompactSketch sketch = CompactSketch.wrap(bytes); + assertFalse(sketch.isEmpty()); + assertEquals(sketch.getRetainedEntries(), 0); + } + } From 5e2e17ad0ceca6c26733e6c7e97d24fb941a7754 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Fri, 29 Aug 2025 17:07:20 -0700 Subject: [PATCH 03/26] Removed deprecated JVM arguments. --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 75247f638..b1d919721 100644 --- a/pom.xml +++ b/pom.xml @@ -95,7 +95,7 @@ under the License. 3.9.11 24 - -Xmx4g -Duser.language=en -Duser.country=US -Dfile.encoding=UTF-8 + -Xmx4g ${java.version} ${java.version} UTF-8 From ecff86a8ad356d5e57a5641c90eed468061e9ec2 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sat, 30 Aug 2025 09:42:18 -0700 Subject: [PATCH 04/26] update pom --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index b1d919721..f06be3d36 100644 --- a/pom.xml +++ b/pom.xml @@ -83,7 +83,6 @@ under the License. - 7.11.0 @@ -96,8 +95,6 @@ under the License. 
3.9.11 24 -Xmx4g - ${java.version} - ${java.version} UTF-8 ${charset.encoding} ${charset.encoding} @@ -156,8 +153,9 @@ under the License. maven-compiler-plugin ${maven-compiler-plugin.version} + ${java.version} - + @@ -275,6 +273,8 @@ under the License. maven-surefire-plugin ${maven-surefire-failsafe-plugins.version} + 1 + true ${jvm-arguments} false false From 26d0523d61bc8ab98f07de86a04447b997e35116 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sat, 30 Aug 2025 09:46:21 -0700 Subject: [PATCH 05/26] update pom --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index f06be3d36..ada136c87 100644 --- a/pom.xml +++ b/pom.xml @@ -94,7 +94,7 @@ under the License. 3.9.11 24 - -Xmx4g + -Xmx4g UTF-8 ${charset.encoding} ${charset.encoding} @@ -275,7 +275,7 @@ under the License. 1 true - ${jvm-arguments} + ${jvm-arguments} false false true From 6c9b690b35fad2c1375e86c6a4eb9a6d5278f498 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sat, 30 Aug 2025 13:21:10 -0700 Subject: [PATCH 06/26] update pom --- pom.xml | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/pom.xml b/pom.xml index ada136c87..1e0fe345b 100644 --- a/pom.xml +++ b/pom.xml @@ -66,8 +66,6 @@ under the License. - GitHub https://github.com/apache/${project.artifactId}/issues @@ -105,10 +103,10 @@ under the License. 3.7.1 3.14.0 3.1.4 - 3.6.0 + 3.6.1 3.2.8 3.4.2 - 3.11.2 + 3.11.3 3.1.1 3.3.1 @@ -584,25 +582,7 @@ under the License. - + generate-java-files From 999aaeecfb1e49848ebb06a166895dab629545c6 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 1 Sep 2025 17:25:13 -0700 Subject: [PATCH 07/26] more pom updates. --- pom.xml | 35 ++++++++++++++++--- .../filters/bloomfilter/package-info.java | 1 + 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 1e0fe345b..ddeccd196 100644 --- a/pom.xml +++ b/pom.xml @@ -92,7 +92,8 @@ under the License. 
3.9.11 24 - -Xmx4g + + -Xmx4g UTF-8 ${charset.encoding} ${charset.encoding} @@ -102,6 +103,7 @@ under the License. 3.7.1 3.14.0 + 3.8.1 3.1.4 3.6.1 3.2.8 @@ -142,6 +144,7 @@ under the License. + org.apache.maven.plugins maven-assembly-plugin ${maven-assembly-plugin.version} @@ -151,13 +154,20 @@ under the License. maven-compiler-plugin ${maven-compiler-plugin.version} + true ${java.version} - + -J${jvm.options} + + org.apache.maven.plugins + maven-dependency-plugin + ${maven-dependency-plugin.version} + + @@ -182,8 +192,9 @@ under the License. [22,) - [${maven.version},4.0.0) + [${maven.version},) + @@ -227,6 +238,10 @@ under the License. true public all,-missing + ${java.version} + + -J${jvm.options} + @@ -273,7 +288,7 @@ under the License. 1 true - ${jvm-arguments} + ${jvm.options} false false true @@ -380,6 +395,18 @@ under the License. + + org.apache.maven.plugins + maven-assembly-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + + org.apache.maven.plugins + maven-dependency-plugin + org.apache.maven.plugins maven-deploy-plugin diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/package-info.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/package-info.java index 5cc6f28fe..4823fcca7 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/package-info.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/package-info.java @@ -16,6 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ + /** * BloomFilter package */ From 9f85d8c9a05ba3224da8f6bca01d4719fe2e94e9 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 9 Sep 2025 16:24:08 -0700 Subject: [PATCH 08/26] Mostly corrected javadocs --- .../datasketches/theta/CompactSketch.java | 128 ++++++++++-------- .../theta/DirectQuickSelectSketchR.java | 13 +- .../org/apache/datasketches/theta/Sketch.java | 34 +++-- .../theta/WrappedCompactCompressedSketch.java | 3 +- .../theta/WrappedCompactSketch.java | 5 +- 5 files changed, 100 insertions(+), 83 deletions(-) diff --git a/src/main/java/org/apache/datasketches/theta/CompactSketch.java b/src/main/java/org/apache/datasketches/theta/CompactSketch.java index 1dcec1a97..f597db7d1 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/CompactSketch.java @@ -190,41 +190,45 @@ private static CompactSketch wrap(final MemorySegment srcSeg, final long seed, f } final short seedHash = Util.computeSeedHash(seed); - if (serVer == 4) { - return DirectCompactCompressedSketch.wrapInstance(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); - } - else if (serVer == 3) { - if (PreambleUtil.isEmptyFlag(srcSeg)) { - return EmptyCompactSketch.getHeapInstance(srcSeg); + switch (serVer) { + case 1: { + return ForwardCompatibility.heapify1to3(srcSeg, seedHash); } - if (otherCheckForSingleItem(srcSeg)) { - return SingleItemSketch.heapify(srcSeg, enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + case 2: { + return ForwardCompatibility.heapify2to3(srcSeg, + enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); } - //not empty & not singleItem - final int flags = extractFlags(srcSeg); - final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; - if (!compactFlag) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have compact flag set"); + case 3: { + if (PreambleUtil.isEmptyFlag(srcSeg)) { + return EmptyCompactSketch.getHeapInstance(srcSeg); + } + if (otherCheckForSingleItem(srcSeg)) { + return SingleItemSketch.heapify(srcSeg, enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + } + //not empty & not singleItem + final int flags = extractFlags(srcSeg); + final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; + if (!compactFlag) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have compact flag set"); + } + final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; + if (!readOnly) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have Read-Only flag set"); + } + return DirectCompactSketch.wrapInstance(srcSeg, + enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); } - final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; - if (!readOnly) { + case 4: { + return DirectCompactCompressedSketch.wrapInstance(srcSeg, + enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + } + default: { throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have Read-Only flag set"); + "Corrupted: Serialization Version " + serVer + " not recognized."); } - return DirectCompactSketch.wrapInstance(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); - } //end of serVer 3 - else if (serVer == 1) { - return ForwardCompatibility.heapify1to3(srcSeg, seedHash); - } - else if (serVer == 2) { - return ForwardCompatibility.heapify2to3(srcSeg, - enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); } - throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); } /** @@ -232,7 +236,7 @@ else if (serVer == 2) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have + *

Only "Direct" Serialization Versions 3 and 4 (i.e., OpenSource) sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a heapify operation. * These early versions were never designed to "wrap".

@@ -242,7 +246,7 @@ else if (serVer == 2) { * This is actually faster and consumes less overall space.

* *

This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image. - * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, + * Note that SerialVersion 1 (pre-open-source) sketches cannot be checked as they don't have a seedHash field, * so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.

* * @param bytes a byte array image of a Sketch that was created using the DEFAULT_UPDATE_SEED. @@ -258,7 +262,7 @@ public static CompactSketch wrap(final byte[] bytes) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have + *

Only "Direct" Serialization Versions 3 and 4 (i.e., OpenSource) sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a heapify operation. * These early versions were never designed to "wrap".

@@ -288,38 +292,46 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } final short seedHash = Util.computeSeedHash(seed); - if (serVer == 4) { - return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); - } else if (serVer == 3) { - final int flags = bytes[FLAGS_BYTE]; - if ((flags & EMPTY_FLAG_MASK) > 0) { - return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); + + switch (serVer) { + case 1: { + return ForwardCompatibility.heapify1to3(MemorySegment.ofArray(bytes), seedHash); } - final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; - if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { - return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + case 2: { + return ForwardCompatibility.heapify2to3(MemorySegment.ofArray(bytes), + enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); } - //not empty & not singleItem - final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; - if (!compactFlag) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have compact flag set"); + case 3: { + final int flags = bytes[FLAGS_BYTE]; + if ((flags & EMPTY_FLAG_MASK) > 0) { + return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); + } + final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; + if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { + return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + } + //not empty & not singleItem + final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; + if (!compactFlag) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have compact flag set"); + } + final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; + if (!readOnly) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have Read-Only flag set"); + } + return WrappedCompactSketch.wrapInstance(bytes, + enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + } + case 4: { + return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); } - final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; - if (!readOnly) { + default: { throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have Read-Only flag set"); + "Corrupted: Serialization Version " + serVer + " not recognized."); } - return WrappedCompactSketch.wrapInstance(bytes, - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); - } else if (serVer == 1) { - return ForwardCompatibility.heapify1to3(MemorySegment.ofArray(bytes), seedHash); - } else if (serVer == 2) { - return ForwardCompatibility.heapify2to3(MemorySegment.ofArray(bytes), - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); } - throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); } //Sketch Overrides diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index 0a81f4887..9926c9b79 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -51,8 +51,7 @@ import org.apache.datasketches.thetacommon.ThetaUtil; /** - * The default Theta Sketch using the QuickSelect algorithm. 
- * This is the read-only implementation with non-functional methods, which affect the state. + * The read-only Theta Sketch using the QuickSelect algorithm. * *

This implementation uses data in a given MemorySegment that is owned and managed by the caller. * This MemorySegment can be off-heap, which if managed properly will greatly reduce the need for @@ -65,17 +64,16 @@ class DirectQuickSelectSketchR extends UpdateSketch { static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space final long seed_; //provided, kept only on heap, never serialized. int hashTableThreshold_; //computed, kept only on heap, never serialized. - MemorySegment wseg_; //A MemorySegment for child class, but no write methods here + MemorySegment wseg_; //This reference is shared with the writable child class, but no write methods here - //only called by DirectQuickSelectSketch and below + //only called by the writable DirectQuickSelectSketch and this class. DirectQuickSelectSketchR(final long seed, final MemorySegment wseg) { seed_ = seed; wseg_ = wseg; } /** - * Wrap a sketch around the given source MemorySegment containing sketch data that originated from - * this sketch. + * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. * @param srcSeg the source MemorySegment. * The given MemorySegment object must be in hash table form and not read only. 
* @param seed See Update Hash Seed @@ -89,8 +87,7 @@ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final l UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); - final DirectQuickSelectSketchR dqssr = - new DirectQuickSelectSketchR(seed, srcSeg); + final DirectQuickSelectSketchR dqssr = new DirectQuickSelectSketchR(seed, srcSeg); dqssr.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqssr; } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 6310d82c4..e551f33c4 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -63,7 +63,7 @@ public abstract class Sketch implements MemorySegmentStatus { * was used to create the source MemorySegment image. * *

For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked.

+ * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.

* * @param srcSeg an image of a Sketch. * @@ -83,8 +83,12 @@ public static Sketch heapify(final MemorySegment srcSeg) { * *

The resulting sketch will not retain any link to the source MemorySegment.

* - *

For Update and Compact Sketches this method checks if the given expectedSeed was used to - * create the source MemorySegment image. However, SerialVersion 1 sketches cannot be checked.

+ *

For Update Sketches this method checks if the + * given expectedSeed

+ * was used to create the source MemorySegment image. + * + *

For Compact Sketches this method assumes that the sketch image was created with the + * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.

* * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -109,8 +113,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to - * "wrap".

+ * where all data will be copied to the heap. These early versions were never designed to "wrap".

* *

Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. @@ -121,10 +124,10 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * was used to create the source MemorySegment image. * *

For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked.

+ * correct hash seed, so it is not checked. SerialVersion 1 (pre-open-source) sketches cannot be checked.

* - * @param srcSeg an image of a Sketch. - * @return a Sketch backed by the given MemorySegment + * @param srcSeg a MemorySegment with an image of a Sketch. + * @return a read-only Sketch backed by the given MemorySegment */ public static Sketch wrap(final MemorySegment srcSeg) { final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; @@ -154,20 +157,23 @@ public static Sketch wrap(final MemorySegment srcSeg) { *

Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to - * "wrap".

+ * where all data will be copied to the heap. These early versions were never designed to "wrap".

* *

Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

* - *

For Update and Compact Sketches this method checks if the given expectedSeed was used to - * create the source MemorySegment image. However, SerialVersion 1 sketches cannot be checked.

+ *

For Update Sketches this method checks if the + * given expectedSeed

+ * was used to create the source MemorySegment image. + * + *

For Compact Sketches this method assumes that the sketch image was created with the + * correct hash seed, so it is not checked. SerialVersion 1 (pre-open-source) sketches cannot be checked.

* * @param srcSeg a MemorySegment with an image of a Sketch. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. - * @return a UpdateSketch backed by the given MemorySegment except as above. + * @return a read-only Sketch backed by the given MemorySegment. */ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; @@ -203,7 +209,7 @@ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { * @return this sketch as an ordered CompactSketch. */ public CompactSketch compact() { - return (this.isCompact()) ? (CompactSketch)this : compact(true, null); + return isCompact() ? (CompactSketch)this : compact(true, null); } /** diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java index e9a952ab4..c4affc9ce 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java @@ -28,7 +28,8 @@ import org.apache.datasketches.common.Util; /** - * Wrapper around a serialized compact compressed read-only sketch. It is not empty, not a single item. + * A wrapper around a serialized compact compressed read-only sketch in the form of a byte array. + * It is not an empty nor a single item sketch. * *

This sketch can only be associated with a Serialization Version 4 format binary image.

*/ diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java index a5b67363f..08726a7ff 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java @@ -37,9 +37,10 @@ import org.apache.datasketches.common.Util; /** - * Wrapper around a serialized compact read-only sketch. It is not empty, not a single item. + * A wrapper around a serialized compact read-only sketch in the form of a byte array. + * It is not an empty nor a single item sketch. * - *

This sketch can only be associated with a Serialization Version 3 format binary image.

+ *

This sketch can only be associated with a Serialization Version 3 binary image format.

*/ class WrappedCompactSketch extends CompactSketch { final byte[] bytes_; From 88d603729132ab67dde0193fe42b22a8543521ae Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Fri, 12 Sep 2025 18:03:28 -0700 Subject: [PATCH 09/26] Speed and code comment suggestions from GPT review. Fix typo. --- .../theta/DirectQuickSelectSketchR.java | 30 ++++----- .../datasketches/theta/HeapUpdateSketch.java | 14 ++--- .../datasketches/theta/UpdateSketch.java | 62 ++++++++++--------- 3 files changed, 48 insertions(+), 58 deletions(-) diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index 9926c9b79..01872eb71 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -62,13 +62,12 @@ */ class DirectQuickSelectSketchR extends UpdateSketch { static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space - final long seed_; //provided, kept only on heap, never serialized. int hashTableThreshold_; //computed, kept only on heap, never serialized. MemorySegment wseg_; //This reference is shared with the writable child class, but no write methods here //only called by the writable DirectQuickSelectSketch and this class. 
DirectQuickSelectSketchR(final long seed, final MemorySegment wseg) { - seed_ = seed; + super(seed); wseg_ = wseg; } @@ -100,8 +99,8 @@ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final l * @return instance of this sketch */ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int lgNomLongs = srcSeg.get(JAVA_BYTE, LG_NOM_LONGS_BYTE) & 0XFF; - final int lgArrLongs = srcSeg.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; + final int lgNomLongs = srcSeg.get(JAVA_BYTE, LG_NOM_LONGS_BYTE) & 0XFF; //mask to byte + final int lgArrLongs = srcSeg.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte final DirectQuickSelectSketchR dqss = new DirectQuickSelectSketchR(seed, srcSeg); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); @@ -114,8 +113,8 @@ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, fin public int getCurrentBytes() { //not compact final byte lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE); - final int preLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - return (preLongs + (1 << lgArrLongs)) << 3; + final int preLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits + return preLongs + (1 << lgArrLongs) << 3; } @Override @@ -127,7 +126,7 @@ public double getEstimate() { @Override public Family getFamily() { - final int familyID = wseg_.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; + final int familyID = wseg_.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; //mask to byte return Family.idToFamily(familyID); } @@ -143,7 +142,7 @@ public long getThetaLong() { @Override public boolean hasMemorySegment() { - return (wseg_ != null) && wseg_.scope().isAlive(); + return wseg_ != null && wseg_.scope().isAlive(); } @Override @@ -196,11 +195,6 @@ public ResizeFactor getResizeFactor() { return ResizeFactor.getRF(getLgRF()); } - @Override - long getSeed() { - return seed_; - } - @Override public UpdateSketch rebuild() { throw 
new SketchesReadOnlyException(); @@ -215,8 +209,8 @@ public void reset() { @Override long[] getCache() { - final long lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; - final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; + final long lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte + final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits final long[] cacheArr = new long[1 << lgArrLongs]; MemorySegment.copy(wseg_, JAVA_LONG_UNALIGNED, preambleLongs << 3, cacheArr, 0, 1 << lgArrLongs); return cacheArr; @@ -254,11 +248,11 @@ boolean isOutOfSpace(final int numEntries) { @Override int getLgArrLongs() { - return wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; + return wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte } int getLgRF() { //only Direct needs this - return (wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT) & 0X3; + return wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT & 0X3; //mask to 2 bits } @Override @@ -277,7 +271,7 @@ UpdateReturnState hashUpdate(final long hash) { protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, //but this allows us to tune these constants for different sketches. - final double fraction = (lgArrLongs <= lgNomLongs) ? DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; + final double fraction = lgArrLongs <= lgNomLongs ? 
DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; return (int) (fraction * (1 << lgArrLongs)); } diff --git a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java index 87e1892b8..aff348281 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java @@ -49,13 +49,12 @@ */ abstract class HeapUpdateSketch extends UpdateSketch { final int lgNomLongs_; - private final long seed_; private final float p_; private final ResizeFactor rf_; HeapUpdateSketch(final int lgNomLongs, final long seed, final float p, final ResizeFactor rf) { + super(seed); lgNomLongs_ = Math.max(lgNomLongs, ThetaUtil.MIN_LG_NOM_LONGS); - seed_ = seed; p_ = p; rf_ = rf; } @@ -66,7 +65,7 @@ abstract class HeapUpdateSketch extends UpdateSketch { public int getCurrentBytes() { final int preLongs = getCurrentPreambleLongs(); final int dataLongs = getCurrentDataLongs(); - return (preLongs + dataLongs) << 3; + return preLongs + dataLongs << 3; } //UpdateSketch @@ -86,11 +85,6 @@ public ResizeFactor getResizeFactor() { return rf_; } - @Override - long getSeed() { - return seed_; - } - //restricted methods @Override @@ -102,14 +96,14 @@ short getSeedHash() { byte[] toByteArray(final int preLongs, final byte familyID) { if (isDirty()) { rebuild(); } checkIllegalCurCountAndEmpty(isEmpty(), getRetainedEntries(true)); - final int preBytes = (preLongs << 3) & 0X3F; //24 bytes + final int preBytes = preLongs << 3 & 0X3F; //24 bytes; mask to 6 bits final int dataBytes = getCurrentDataLongs() << 3; final byte[] byteArrOut = new byte[preBytes + dataBytes]; final MemorySegment segOut = MemorySegment.ofArray(byteArrOut); //preamble first 8 bytes. Note: only compact can be reduced to 8 bytes. 
- final int lgRf = getResizeFactor().lg() & 0x3; + final int lgRf = getResizeFactor().lg() & 0x3; //mask to 2 bits insertPreLongs(segOut, preLongs); //byte 0 low 6 bits insertLgResizeFactor(segOut, lgRf); //byte 0 high 2 bits insertSerVer(segOut, SER_VER); //byte 1 diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 7db1988e9..4635e75a0 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -60,8 +60,11 @@ * @author Lee Rhodes */ public abstract class UpdateSketch extends Sketch { + private final long seed_; - UpdateSketch() {} + UpdateSketch(final long seed) { + seed_ = seed; //kept only on heap, never serialized. Hoisted here for performance. + } /** * Wrap takes the writable sketch image in MemorySegment and refers to it directly. There is no data copying onto @@ -91,17 +94,17 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg) { * @return a UpdateSketch backed by the given MemorySegment */ public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expectedSeed) { - Objects.requireNonNull(srcWSeg, "Source MemorySeg e t must not be null"); + Objects.requireNonNull(srcWSeg, "Source MemorySegment must not be null"); checkBounds(0, 24, srcWSeg.byteSize()); //need min 24 bytes - final int preLongs = srcWSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcWSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcWSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; + final int preLongs = srcWSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits + final int serVer = srcWSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; //mask to byte + final int familyID = srcWSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; //mask to byte final Family family = Family.idToFamily(familyID); if (family != Family.QUICKSELECT) { throw new SketchesArgumentException( "A " + 
family + " sketch cannot be wrapped as an UpdateSketch."); } - if ((serVer == 3) && (preLongs == 3)) { + if (serVer == 3 && preLongs == 3) { return DirectQuickSelectSketch.writableWrap(srcWSeg, expectedSeed); } else { throw new SketchesArgumentException( @@ -150,7 +153,7 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstWS public int getCompactBytes() { final int preLongs = getCompactPreambleLongs(); final int dataLongs = getRetainedEntries(true); - return (preLongs + dataLongs) << 3; + return preLongs + dataLongs << 3; } @Override @@ -160,7 +163,7 @@ int getCurrentDataLongs() { @Override public boolean hasMemorySegment() { - return ((this instanceof DirectQuickSelectSketchR) && ((DirectQuickSelectSketchR)this).hasMemorySegment()); + return this instanceof DirectQuickSelectSketchR && ((DirectQuickSelectSketchR)this).hasMemorySegment(); } @Override @@ -170,7 +173,7 @@ public boolean isCompact() { @Override public boolean isOffHeap() { - return ((this instanceof DirectQuickSelectSketchR) && ((DirectQuickSelectSketchR)this).isOffHeap()); + return this instanceof DirectQuickSelectSketchR && ((DirectQuickSelectSketchR)this).isOffHeap(); } @Override @@ -180,7 +183,7 @@ public boolean isOrdered() { @Override public boolean isSameResource(final MemorySegment that) { - return (this instanceof final DirectQuickSelectSketchR dqssr) && dqssr.isSameResource(that); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.isSameResource(that); } //UpdateSketch interface @@ -210,7 +213,7 @@ public static final UpdateSketchBuilder builder() { * Gets the configured seed * @return the configured seed */ - abstract long getSeed(); + public long getSeed() { return seed_; } /** * Resets this sketch back to a virgin empty state. 
@@ -232,8 +235,7 @@ public static final UpdateSketchBuilder builder() { * See Update Return State */ public UpdateReturnState update(final long datum) { - final long[] data = { datum }; - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(datum, seed_)[0] >>> 1); } /** @@ -248,9 +250,9 @@ public UpdateReturnState update(final long datum) { * See Update Return State */ public UpdateReturnState update(final double datum) { - final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0 - final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN & +/- infinity forms - return hashUpdate(hash(data, getSeed())[0] >>> 1); + final double d = datum == 0.0 ? 0.0 : datum; // canonicalize -0.0, 0.0 + final long data = Double.doubleToLongBits(d);// canonicalize all NaN & +/- infinity forms + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -267,11 +269,11 @@ public UpdateReturnState update(final double datum) { * See Update Return State */ public UpdateReturnState update(final String datum) { - if ((datum == null) || datum.isEmpty()) { + if (datum == null || datum.isEmpty()) { return RejectedNullOrEmpty; } final byte[] data = datum.getBytes(UTF_8); - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -283,10 +285,10 @@ public UpdateReturnState update(final String datum) { * See Update Return State */ public UpdateReturnState update(final byte[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -298,10 +300,10 @@ public UpdateReturnState update(final byte[] data) { * See Update Return State */ public UpdateReturnState update(final ByteBuffer buffer) { - if ((buffer == null) || !buffer.hasRemaining()) { + if (buffer == null || !buffer.hasRemaining()) { return RejectedNullOrEmpty; 
} - return hashUpdate(hash(buffer, getSeed())[0] >>> 1); + return hashUpdate(hash(buffer, seed_)[0] >>> 1); } /** @@ -316,10 +318,10 @@ public UpdateReturnState update(final ByteBuffer buffer) { * See Update Return State */ public UpdateReturnState update(final char[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -331,10 +333,10 @@ public UpdateReturnState update(final char[] data) { * See Update Return State */ public UpdateReturnState update(final int[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } /** @@ -346,10 +348,10 @@ public UpdateReturnState update(final int[] data) { * See Update Return State */ public UpdateReturnState update(final long[] data) { - if ((data == null) || (data.length == 0)) { + if (data == null || data.length == 0) { return RejectedNullOrEmpty; } - return hashUpdate(hash(data, getSeed())[0] >>> 1); + return hashUpdate(hash(data, seed_)[0] >>> 1); } //restricted methods @@ -455,7 +457,7 @@ static void checkSegIntegrity(final MemorySegment srcSeg, final long expectedSee final long thetaLong = extractThetaLong(srcSeg); //bytes 16-23 final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; //if (lgArrLongs <= lgNomLongs) the sketch is still resizing, thus theta cannot be < p. - if ((lgArrLongs <= lgNomLongs) && (theta < p) ) { + if (lgArrLongs <= lgNomLongs && theta < p ) { throw new SketchesArgumentException( "Possible corruption: Theta cannot be < p and lgArrLongs <= lgNomLongs. 
" + lgArrLongs + " <= " + lgNomLongs + ", Theta: " + theta + ", p: " + p); @@ -477,7 +479,7 @@ static boolean isResizeFactorIncorrect(final MemorySegment srcSeg, final int lgN final int lgA = lgArrLongs; final int lgR = extractLgResizeFactor(srcSeg); if (lgR == 0) { return lgA != lgT; } - return (((lgT - lgA) % lgR) != 0); + return (lgT - lgA) % lgR != 0; } } From 10e82733f2dd0a54f30ee2d35895a05664a7cd3a Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 22 Sep 2025 17:25:48 -0700 Subject: [PATCH 10/26] More corrections/improvements due to ChatGPT review. This includes a minor bug fix found by ChatGPT. --- pom.xml | 6 +- .../common/MemorySegmentStatus.java | 3 + .../apache/datasketches/theta/BitPacking.java | 6 +- .../datasketches/theta/CompactOperations.java | 6 +- .../theta/DirectQuickSelectSketch.java | 126 ++++++++---------- .../theta/DirectQuickSelectSketchR.java | 88 ++++++------ .../datasketches/theta/PreambleUtil.java | 30 +++-- .../org/apache/datasketches/theta/Sketch.java | 18 ++- .../datasketches/theta/UpdateSketch.java | 16 ++- .../theta/UpdateSketchBuilder.java | 16 +++ .../datasketches/thetacommon/ThetaUtil.java | 3 +- 11 files changed, 179 insertions(+), 139 deletions(-) diff --git a/pom.xml b/pom.xml index ddeccd196..e130f2694 100644 --- a/pom.xml +++ b/pom.xml @@ -91,7 +91,7 @@ under the License. 3.9.11 - 24 + 25 -Xmx4g UTF-8 @@ -189,7 +189,7 @@ under the License. - [22,) + [25,) [${maven.version},) @@ -288,7 +288,7 @@ under the License. 
1 true - ${jvm.options} + ${argLine} ${jvm.options} false false true diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentStatus.java b/src/main/java/org/apache/datasketches/common/MemorySegmentStatus.java index 6a4bde853..7e4ebcd38 100644 --- a/src/main/java/org/apache/datasketches/common/MemorySegmentStatus.java +++ b/src/main/java/org/apache/datasketches/common/MemorySegmentStatus.java @@ -20,6 +20,7 @@ package org.apache.datasketches.common; import java.lang.foreign.MemorySegment; +import java.util.Objects; import java.util.Optional; /** @@ -66,6 +67,8 @@ public interface MemorySegmentStatus { * @return true if the two given MemorySegments have to the same backing resource. */ static boolean isSameResource(final MemorySegment seg1, final MemorySegment seg2) { + Objects.requireNonNull(seg1, "MemorySegment seg1 must be non-null."); + Objects.requireNonNull(seg2, "MemorySegment seg2 must be non-null."); final Optional opt = seg1.asOverlappingSlice(seg2); return opt.isPresent(); } diff --git a/src/main/java/org/apache/datasketches/theta/BitPacking.java b/src/main/java/org/apache/datasketches/theta/BitPacking.java index cd7dfe1c9..fb8aa0619 100644 --- a/src/main/java/org/apache/datasketches/theta/BitPacking.java +++ b/src/main/java/org/apache/datasketches/theta/BitPacking.java @@ -24,7 +24,7 @@ /** * Used as part of Theta compression. 
*/ -public final class BitPacking { +final class BitPacking { private BitPacking() { } @@ -36,7 +36,7 @@ private BitPacking() { } * @param bufOffset the byte offset in the buffer * @param bitOffset the bit offset */ - public static void packBits(final long value, int bits, final byte[] buffer, int bufOffset, final int bitOffset) { + static void packBits(final long value, int bits, final byte[] buffer, int bufOffset, final int bitOffset) { if (bitOffset > 0) { final int chunkBits = 8 - bitOffset; final int mask = (1 << chunkBits) - 1; @@ -65,7 +65,7 @@ public static void packBits(final long value, int bits, final byte[] buffer, int * @param bufOffset the buffer offset * @param bitOffset the bit offset */ - public static void unpackBits(final long[] value, final int index, int bits, final byte[] buffer, + static void unpackBits(final long[] value, final int index, int bits, final byte[] buffer, int bufOffset,final int bitOffset) { final int availBits = 8 - bitOffset; final int chunkBits = availBits <= bits ? availBits : bits; diff --git a/src/main/java/org/apache/datasketches/theta/CompactOperations.java b/src/main/java/org/apache/datasketches/theta/CompactOperations.java index 926600638..54ec7d10f 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactOperations.java +++ b/src/main/java/org/apache/datasketches/theta/CompactOperations.java @@ -324,7 +324,7 @@ static long[] compactCache(final long[] srcCache, final int curCount, */ /** - * This corrects a temporary anomalous condition where compact() is called on an UpdateSketch + * This corrects a temporary anomalous condition where compact() or toByteArray() is called on an UpdateSketch * that was initialized with p < 1.0 and update() was never called. In this case Theta < 1.0, * curCount = 0, and empty = true. The correction is to change Theta to 1.0, which makes the * returning sketch empty. 
This should only be used in the compaction or serialization of an @@ -347,8 +347,8 @@ static long correctThetaOnCompact(final boolean empty, final int curCount, * @param curCount the given current count */ //This handles #2 and #6 above static void checkIllegalCurCountAndEmpty(final boolean empty, final int curCount) { - if (empty && (curCount != 0)) { //this handles #2 and #6 above - throw new SketchesStateException("Illegal State: Empty=true and Current Count != 0."); + if (empty && curCount != 0) { //this handles #2 and #6 above + throw new SketchesStateException("Possible corruption. Illegal State: Empty=true and Current Count != 0."); } } diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java index 45b01edba..7a0d16cc6 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java @@ -62,6 +62,7 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.common.SuppressFBWarnings; import org.apache.datasketches.common.Util; import org.apache.datasketches.thetacommon.HashOperations; import org.apache.datasketches.thetacommon.ThetaUtil; @@ -78,15 +79,23 @@ * @author Kevin Lang */ class DirectQuickSelectSketch extends DirectQuickSelectSketchR { + private static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space + int hashTableThreshold_; //computed and mutable, kept only on heap, never serialized. + /** + * Construct this sketch as a result of a wrap operation where the given MemorySegment already has a sketch image. + * @param wseg the given MemorySegment that has a sketch image. + * @param seed See Update Hash Seed. 
+ */ private DirectQuickSelectSketch( - final long seed, - final MemorySegment wseg) { - super(seed, wseg); + final MemorySegment wseg, + final long seed) { + super(wseg, seed); } /** * Construct a new sketch instance and initialize the given MemorySegment as its backing store. + * This is only called internally by other theta sketch classes. * * @param lgNomLongs See lgNomLongs. * @param seed See Update Hash Seed. @@ -106,42 +115,22 @@ private DirectQuickSelectSketch( final ResizeFactor rf, final MemorySegment dstSeg, final boolean unionGadget) { - this( - checkSegSize(lgNomLongs, rf, dstSeg, unionGadget), - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. - //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - lgNomLongs, - seed, - p, - rf, - dstSeg, - unionGadget); - } - private DirectQuickSelectSketch( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final int lgNomLongs, - final long seed, - final float p, - final ResizeFactor rf, - final MemorySegment dstSeg, - final boolean unionGadget) { - super(seed, dstSeg); //Choose family, preambleLongs - final Family family; - final int preambleLongs; - if (unionGadget) { - preambleLongs = Family.UNION.getMinPreLongs(); - family = Family.UNION; - } - else { - preambleLongs = Family.QUICKSELECT.getMinPreLongs(); - family = Family.QUICKSELECT; - } + final Family family = unionGadget ? Family.UNION : Family.QUICKSELECT; + final int preambleLongs = unionGadget ? Family.UNION.getMinPreLongs() : Family.QUICKSELECT.getMinPreLongs(); - //Choose RF, minReqBytes, lgArrLongs. + //Set RF, lgArrLongs. final int lgRF = rf.lg(); - final int lgArrLongs = (lgRF == 0) ? lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; + final int lgArrLongs = lgRF == 0 ? 
lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; + + //check Segment capacity + final int minReqBytes = getSegBytes(lgArrLongs, preambleLongs); + final long curSegCapBytes = dstSeg.byteSize(); + if (curSegCapBytes < minReqBytes) { + throw new SketchesArgumentException( + "MemorySegment capacity is too small: " + curSegCapBytes + " < " + minReqBytes); + } //@formatter:off //Build preamble @@ -157,29 +146,14 @@ private DirectQuickSelectSketch( insertP(dstSeg, p); //bytes 12-15 final long thetaLong = (long)(p * LONG_MAX_VALUE_AS_DOUBLE); insertThetaLong(dstSeg, thetaLong); //bytes 16-23 - if (unionGadget) { - insertUnionThetaLong(dstSeg, thetaLong); - } //@formatter:on + if (unionGadget) { insertUnionThetaLong(dstSeg, thetaLong); } + //clear hash table area dstSeg.asSlice(preambleLongs << 3, Long.BYTES << lgArrLongs).fill((byte)0); - hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - } - - private static final boolean checkSegSize( - final int lgNomLongs, final ResizeFactor rf, final MemorySegment dstSeg, final boolean unionGadget) { - final int preambleLongs = (unionGadget) ? Family.UNION.getMinPreLongs() : Family.QUICKSELECT.getMinPreLongs(); - final int lgRF = rf.lg(); - final int lgArrLongs = (lgRF == 0) ? 
lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; - final int minReqBytes = getSegBytes(lgArrLongs, preambleLongs); - final long curSegCapBytes = dstSeg.byteSize(); - if (curSegCapBytes < minReqBytes) { - throw new SketchesArgumentException( - "MemorySegment capacity is too small: " + curSegCapBytes + " < " + minReqBytes); - } - return true; + super(dstSeg, seed); } /** @@ -202,8 +176,7 @@ static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final lo insertLgResizeFactor(srcSeg, ResizeFactor.X2.lg()); } - final DirectQuickSelectSketch dqss = - new DirectQuickSelectSketch(seed, srcSeg); + final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, seed); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -219,8 +192,7 @@ static DirectQuickSelectSketch fastWritableWrap(final MemorySegment srcSeg, fina final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - final DirectQuickSelectSketch dqss = - new DirectQuickSelectSketch(seed, srcSeg); + final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, seed); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -274,20 +246,17 @@ UpdateReturnState hashUpdate(final long hash) { final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //The duplicate test - final int index = - HashOperations.hashSearchOrInsertMemorySegment(wseg_, lgArrLongs, hash, preambleLongs << 3); - if (index >= 0) { - return RejectedDuplicate; //Duplicate, not inserted - } + final int index = HashOperations.hashSearchOrInsertMemorySegment(wseg_, lgArrLongs, hash, preambleLongs << 3); + if (index >= 0) { return RejectedDuplicate; } //Duplicate, not inserted + //insertion occurred, increment curCount final int curCount = getRetainedEntries(true) + 1; wseg_.set(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT, curCount); //update curCount if 
(isOutOfSpace(curCount)) { //we need to do something, we are out of space - if (lgArrLongs > lgNomLongs) { //at full size, rebuild - //Assumes no dirty values, changes thetaLong, curCount_ - assert (lgArrLongs == (lgNomLongs + 1)) : "lgArr: " + lgArrLongs + ", lgNom: " + lgNomLongs; + if (lgArrLongs > lgNomLongs) { //at full size, rebuild, assumes no dirty values, changes thetaLong, curCount_ + assert lgArrLongs == lgNomLongs + 1 : "lgArr: " + lgArrLongs + ", lgNom: " + lgNomLongs; //rebuild, refresh curCount based on # values in the hashtable. quickSelectAndRebuild(wseg_, preambleLongs, lgNomLongs); return InsertedCountIncrementedRebuilt; @@ -305,23 +274,42 @@ UpdateReturnState hashUpdate(final long hash) { return InsertedCountIncrementedResized; } //end of Expand in current MemorySegment, exit. - else { - //Request more space, then resize. lgArrLongs will change; thetaLong, curCount will not + else { //Request larger segment, then resize. lgArrLongs will change; thetaLong, curCount will not final int preBytes = preambleLongs << 3; tgtLgArrLongs = Math.min(lgArrLongs + lgRF, lgNomLongs + 1); final int tgtArrBytes = 8 << tgtLgArrLongs; final int reqBytes = tgtArrBytes + preBytes; - final MemorySegment newDstSeg = MemorySegment.ofArray(new byte[reqBytes]); + final MemorySegment newDstSeg = MemorySegment.ofArray(new byte[reqBytes]); //always on-heap //TODO ADD MemSegReq moveAndResize(wseg_, preambleLongs, lgArrLongs, newDstSeg, tgtLgArrLongs, thetaLong); wseg_ = newDstSeg; hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, tgtLgArrLongs); return InsertedCountIncrementedResized; - } //end of Request more space to resize + } //end of request new segment & resize } //end of resize } //end of isOutOfSpace return InsertedCountIncremented; } + @Override + boolean isOutOfSpace(final int numEntries) { + return numEntries > hashTableThreshold_; + } + + /** + * Returns the cardinality limit given the current size of the hash table array. 
+ * + * @param lgNomLongs See lgNomLongs. + * @param lgArrLongs See lgArrLongs. + * @return the hash table threshold + */ + @SuppressFBWarnings(value = "DB_DUPLICATE_BRANCHES", justification = "False Positive, see the code comments") + protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { + //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, + //but this allows us to tune these constants for different sketches. + final double fraction = lgArrLongs <= lgNomLongs ? DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; + return (int) (fraction * (1 << lgArrLongs)); + } + } diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index 01872eb71..e4f7fcaf1 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -28,7 +28,7 @@ import static org.apache.datasketches.theta.CompactOperations.correctThetaOnCompact; import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE; +//import static org.apache.datasketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_RESIZE_FACTOR_BIT; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.P_FLOAT; @@ -42,16 +42,15 @@ import static org.apache.datasketches.theta.PreambleUtil.insertThetaLong; import java.lang.foreign.MemorySegment; +import java.util.Objects; import org.apache.datasketches.common.Family; import org.apache.datasketches.common.MemorySegmentStatus; import org.apache.datasketches.common.ResizeFactor; import 
org.apache.datasketches.common.SketchesReadOnlyException; -import org.apache.datasketches.common.SuppressFBWarnings; -import org.apache.datasketches.thetacommon.ThetaUtil; /** - * The read-only Theta Sketch using the QuickSelect algorithm. + * The read-only Theta Sketch. * *

This implementation uses data in a given MemorySegment that is owned and managed by the caller. * This MemorySegment can be off-heap, which if managed properly will greatly reduce the need for @@ -61,16 +60,41 @@ * @author Kevin Lang */ class DirectQuickSelectSketchR extends UpdateSketch { - static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space - int hashTableThreshold_; //computed, kept only on heap, never serialized. - MemorySegment wseg_; //This reference is shared with the writable child class, but no write methods here - //only called by the writable DirectQuickSelectSketch and this class. - DirectQuickSelectSketchR(final long seed, final MemorySegment wseg) { + /** + * This MemorySegment reference is also used by the writable child DirectQuickSelectSketch. + * + *

When this class is constructed with the writable constructor, called by the writable child DirectQuickSelectSketch, + * this reference can be changed and its contents can be modified.

+ * + *

When this class is constructed with the read-only constructor, called from local factories, this MemorySegment will + * be placed in read-only mode.

+ */ + MemorySegment wseg_; // + + /** + * This writable constructor is only called by the writable child DirectQuickSelectSketch and then this class provides the + * read-only methods for the DirectQuickSelectSketch class. + * @param wseg the writable MemorySegment used by the writable child DirectQuickSelectSketch. + * @param seed the seed for the update function for the writable child DirectQuickSelectSketch. + */ + DirectQuickSelectSketchR(final MemorySegment wseg, final long seed) { + Objects.requireNonNull(wseg, "MemorySegment wseg must not be null"); super(seed); wseg_ = wseg; } + /** + * This read-only constructor is only called by local factory methods which use this class as a read-only direct sketch. + * @param seed the seed used to validate the internal hashes of the given source MemorySegment. + * @param srcSeg the read-only MemorySegment used by this class in read-only mode. + */ + private DirectQuickSelectSketchR(final long seed, final MemorySegment srcSeg) { + Objects.requireNonNull(srcSeg, "MemorySegment srcSeg must not be null"); + super(seed); + wseg_ = srcSeg.asReadOnly(); + } + /** * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. * @param srcSeg the source MemorySegment. @@ -85,26 +109,19 @@ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final l UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); - - final DirectQuickSelectSketchR dqssr = new DirectQuickSelectSketchR(seed, srcSeg); - dqssr.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - return dqssr; + return new DirectQuickSelectSketchR(seed, srcSeg); } /** * Fast-wrap a sketch around the given source MemorySegment containing sketch data that originated from * this sketch. This does NO validity checking of the given MemorySegment. 
+ * Caller must ensure segment contents are a valid sketch image. * @param srcSeg The given MemorySegment object must be in hash table form and not read only. * @param seed See Update Hash Seed * @return instance of this sketch */ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int lgNomLongs = srcSeg.get(JAVA_BYTE, LG_NOM_LONGS_BYTE) & 0XFF; //mask to byte - final int lgArrLongs = srcSeg.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte - - final DirectQuickSelectSketchR dqss = new DirectQuickSelectSketchR(seed, srcSeg); - dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - return dqss; + return new DirectQuickSelectSketchR(seed, srcSeg); } //Sketch @@ -112,7 +129,7 @@ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, fin @Override public int getCurrentBytes() { //not compact - final byte lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE); + final int lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte final int preLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits return preLongs + (1 << lgArrLongs) << 3; } @@ -131,7 +148,7 @@ public Family getFamily() { } @Override - public int getRetainedEntries(final boolean valid) { //always valid + public int getRetainedEntries(final boolean valid) { //always valid for theta return wseg_.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); } @@ -157,7 +174,7 @@ public boolean isEmpty() { @Override public boolean isSameResource(final MemorySegment that) { - return hasMemorySegment() && MemorySegmentStatus.isSameResource(wseg_, that); + return hasMemorySegment() && MemorySegmentStatus.isSameResource(wseg_, that); //null checks done here } @Override @@ -167,14 +184,14 @@ public HashIterator iterator() { @Override public byte[] toByteArray() { //MY_FAMILY is stored in wseg_ - checkIllegalCurCountAndEmpty(isEmpty(), extractCurCount(wseg_)); + final int curCount = 
extractCurCount(wseg_); + checkIllegalCurCountAndEmpty(isEmpty(), curCount); final int lengthBytes = getCurrentBytes(); final byte[] byteArray = new byte[lengthBytes]; final MemorySegment seg = MemorySegment.ofArray(byteArray); MemorySegment.copy(wseg_, 0, seg, 0, lengthBytes); - final long thetaLong = - correctThetaOnCompact(isEmpty(), extractCurCount(wseg_), extractThetaLong(wseg_)); - insertThetaLong(wseg_, thetaLong); + final long thetaLong = correctThetaOnCompact(isEmpty(), curCount, extractThetaLong(wseg_)); + insertThetaLong(seg, thetaLong); return byteArray; } @@ -242,8 +259,8 @@ boolean isDirty() { } @Override - boolean isOutOfSpace(final int numEntries) { - return numEntries > hashTableThreshold_; + boolean isOutOfSpace(final int numEntries) { //overridden by writable DirectQuickSelectSketch + return false; } @Override @@ -260,19 +277,4 @@ UpdateReturnState hashUpdate(final long hash) { throw new SketchesReadOnlyException(); } - /** - * Returns the cardinality limit given the current size of the hash table array. - * - * @param lgNomLongs See lgNomLongs. - * @param lgArrLongs See lgArrLongs. - * @return the hash table threshold - */ - @SuppressFBWarnings(value = "DB_DUPLICATE_BRANCHES", justification = "False Positive, see the code comments") - protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { - //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, - //but this allows us to tune these constants for different sketches. - final double fraction = lgArrLongs <= lgNomLongs ? 
DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; - return (int) (fraction * (1 << lgArrLongs)); - } - } diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index b3451fcd1..296fc7d5e 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -126,17 +126,17 @@ *
  * Long || Start Byte Adr:
  * Adr:
- *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
- *  0   ||    Seed Hash    | Flags  | numEB  | entBits| FamID  | SerVer |     PreLongs = 3   |
+ *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1     |   0              |
+ *  0   ||    Seed Hash    | Flags  | numEB  | entBits| FamID  | SerVer=4 |   PreLongs = 3   |
  *
- *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
- *  1   ||------------------------------THETA_LONG-------------------------------------------|
+ *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9     |   8              |
+ *  1   ||------------------------------THETA_LONG-------------------------------------------| (only if estimating)
  *
- *      ||        |        |        |  (20)  |  (19)  |  (18)  |  (17)  |    16              |
- *  2   ||----------------Retained Entries stored as 1 to 4 bytes----------------------------|
+ *      ||        |        |        |   20   |  (19)  |  (18)  |  (17)    |  16              |
+ *  2   ||--------Retained Entries stored as 1 to 4 bytes in bytes 16-19---------------------|
  *
- *      ||        |        |        |        |        |        |        |                    |
- *  3   ||------------------Delta encoded compressed byte array------------------------------|
+ *      ||        |        |        |        |        |        |          |                  |
+ *  3   ||--------Delta encoded compressed byte array starts at bytes 17-20------------------|
  *  
* *

The UpdateSketch and AlphaSketch require 24 bytes of preamble followed by a non-compact @@ -318,7 +318,7 @@ else if (preLongs == 4) { //Union sb.append("Byte 0: ResizeFactor : ").append(rfId + ", " + rf.toString()).append(LS); sb.append("Byte 1: Serialization Version: ").append(serVer).append(LS); sb.append("Byte 2: Family : ").append(familyId + ", " + family.toString()).append(LS); - sb.append("Byte 3: LgNomLongs : ").append(lgNomLongs).append(LS); + sb.append("Byte 3: LgNomLongs, LgK : ").append(lgNomLongs).append(LS); sb.append("Byte 4: LgArrLongs : ").append(lgArrLongs).append(LS); sb.append("Byte 5: Flags Field : ").append(flagsStr).append(LS); sb.append(" Bit Flag Name : State:").append(LS); @@ -351,8 +351,13 @@ else if (preLongs == 3) { sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS); sb.append(" Theta (long) : ").append(thetaLong).append(LS); sb.append(" Theta (long,hex) : ").append(thetaHex).append(LS); + if (serVer == 4) { + sb.append( "TOTAL Storage Bytes : ").append(seg.byteSize()).append(LS); + sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS); + return sb.toString(); + } } - else { //preLongs == 4 + else { //preLongs == 4 (Union) sb.append("Bytes 8-11 : CurrentCount : ").append(curCount).append(LS); sb.append("Bytes 12-15: P : ").append(p).append(LS); sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS); @@ -363,9 +368,8 @@ else if (preLongs == 3) { sb.append(" ThetaU (long,hex): ").append(thetaUHex).append(LS); } sb.append( "Preamble Bytes : ").append(preLongs * 8).append(LS); - sb.append( "Data Bytes : ").append(curCount * 8).append(LS); - sb.append( "TOTAL Sketch Bytes : ").append((preLongs + curCount) * 8).append(LS); - sb.append( "TOTAL Capacity Bytes : ").append(seg.byteSize()).append(LS); + sb.append( "Retained Data Bytes : ").append(curCount * 8).append(LS); + sb.append( "TOTAL Storage Bytes : ").append(seg.byteSize()).append(LS); sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS); return 
sb.toString(); } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index e551f33c4..5c14b8fda 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -333,6 +333,16 @@ public static int getMaxUpdateSketchBytes(final int nomEntries) { return (nomEnt << 4) + (Family.QUICKSELECT.getMaxPreLongs() << 3); } + /** + * Returns the maximum number of storage bytes required for an UpdateSketch with the given + * log_base2 of the nominal entries. + * @param lgNomEntries log_base2 of Nominal Entries + * @return the maximum number of storage bytes required for a UpdateSketch with the given lgNomEntries + */ + public static int getUpdateSketchMaxBytes(final int lgNomEntries) { + return (1 << lgNomEntries << 4) + (Family.QUICKSELECT.getMaxPreLongs() << 3); + } + /** * Returns the number of valid entries that have been retained by the sketch. * @return the number of valid retained entries @@ -451,7 +461,10 @@ public String toString() { * @param hexMode If true, hashes will be output in hex. * @return The result string, which can be very long. */ - public String toString(final boolean sketchSummary, final boolean dataDetail, final int width, + public String toString( + final boolean sketchSummary, + final boolean dataDetail, + final int width, final boolean hexMode) { final StringBuilder sb = new StringBuilder(); @@ -554,6 +567,9 @@ public static String toString(final MemorySegment seg) { /** * Gets the internal cache array. For on-heap sketches this will return a reference to the actual * cache array. For MemorySegment-based sketches this returns a copy. + * + *

This can be an expensive operation and is intended for diagnostic & test applications. + * Use {@link #iterator() iterator()} instead.

* @return the internal cache array. */ abstract long[] getCache(); diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 4635e75a0..f080e976a 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -75,6 +75,9 @@ public abstract class UpdateSketch extends Sketch { * @param srcWSeg an image of a writable sketch where the image seed hash matches the default seed hash. * It must have a size of at least 24 bytes. * @return an UpdateSketch backed by the given MemorySegment + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch wrap(final MemorySegment srcWSeg) { return wrap(srcWSeg, Util.DEFAULT_UPDATE_SEED); @@ -92,6 +95,9 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg) { * See Update Hash Seed. * Compact sketches store a 16-bit hash of the seed, but not the seed itself. * @return a UpdateSketch backed by the given MemorySegment + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expectedSeed) { Objects.requireNonNull(srcWSeg, "Source MemorySegment must not be null"); @@ -118,6 +124,9 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expected * @param srcSeg the given MemorySegment with a sketch image. * It must have a size of at least 24 bytes. * @return an UpdateSketch + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. 
*/ public static UpdateSketch heapify(final MemorySegment srcSeg) { return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); @@ -130,6 +139,9 @@ public static UpdateSketch heapify(final MemorySegment srcSeg) { * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. * @return an UpdateSketch + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); @@ -163,7 +175,7 @@ int getCurrentDataLongs() { @Override public boolean hasMemorySegment() { - return this instanceof DirectQuickSelectSketchR && ((DirectQuickSelectSketchR)this).hasMemorySegment(); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.hasMemorySegment(); } @Override @@ -173,7 +185,7 @@ public boolean isCompact() { @Override public boolean isOffHeap() { - return this instanceof DirectQuickSelectSketchR && ((DirectQuickSelectSketchR)this).isOffHeap(); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.isOffHeap(); } @Override diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java index 834778f87..6c8d5f37f 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java @@ -117,6 +117,22 @@ public UpdateSketchBuilder setLogNominalEntries(final int lgNomEntries) { return this; } + /** + * Alternative method of setting the Nominal Entries for this sketch from the log_base2 value, + * commonly called LgK. + * This value is also used for building a shared concurrent sketch. + * The minimum value is 4 and the maximum value is 26. 
+ * Be aware that sketches as large as 26 may not have been + * thoroughly characterized for performance. + * + * @param lgK the Log Nominal Entries. Also for the concurrent shared sketch + * @return this UpdateSketchBuilder + */ + public UpdateSketchBuilder setLgK(final int lgK) { + bLgNomLongs = ThetaUtil.checkNomLongs(1 << lgK); + return this; + } + /** * Returns Log-base 2 Nominal Entries * @return Log-base 2 Nominal Entries diff --git a/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java b/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java index 4012cb412..778dc02f2 100644 --- a/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java +++ b/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java @@ -71,8 +71,7 @@ private ThetaUtil() {} * @param lgMin Log2 of the minimum allowed starting size * @return The Log2 of the starting size */ - public static int startingSubMultiple(final int lgTarget, final int lgRF, - final int lgMin) { + public static int startingSubMultiple(final int lgTarget, final int lgRF, final int lgMin) { return lgTarget <= lgMin ? lgMin : lgRF == 0 ? lgTarget : (lgTarget - lgMin) % lgRF + lgMin; } From fa97b7c3ee44ab2da381fe313dbe8e00a62b2542 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 22 Sep 2025 21:58:25 -0700 Subject: [PATCH 11/26] Update GHA workflows from Java 24 to Java 25. 
--- .github/workflows/auto-jdk-matrix.yml | 2 +- .github/workflows/auto-os-matrix.yml | 2 +- .github/workflows/check_cpp_files.yml | 2 +- .github/workflows/codeql-analysis.yml | 2 +- .github/workflows/javadoc.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/auto-jdk-matrix.yml b/.github/workflows/auto-jdk-matrix.yml index f5ba78463..6e71e53b4 100644 --- a/.github/workflows/auto-jdk-matrix.yml +++ b/.github/workflows/auto-jdk-matrix.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - jdk: [ 24 ] + jdk: [ 25 ] env: JDK_VERSION: ${{ matrix.jdk }} diff --git a/.github/workflows/auto-os-matrix.yml b/.github/workflows/auto-os-matrix.yml index abb9baec3..3ee7076b4 100644 --- a/.github/workflows/auto-os-matrix.yml +++ b/.github/workflows/auto-os-matrix.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: - jdk: [ 24 ] + jdk: [ 25 ] os: [ windows-latest, ubuntu-latest, macos-latest ] include: - os: windows-latest diff --git a/.github/workflows/check_cpp_files.yml b/.github/workflows/check_cpp_files.yml index e433dcb87..42e64a099 100644 --- a/.github/workflows/check_cpp_files.yml +++ b/.github/workflows/check_cpp_files.yml @@ -27,7 +27,7 @@ jobs: - name: Setup Java uses: actions/setup-java@v4 with: - java-version: '24' + java-version: '25' distribution: 'temurin' - name: Configure C++ build diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index d8114578a..f476a2b7f 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -35,7 +35,7 @@ jobs: with: distribution: 'temurin' cache: 'maven' - java-version: '24' + java-version: '25' - name: Initialize CodeQL uses: github/codeql-action/init@v3 diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index 2fef93616..b7e7289c6 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -19,7 +19,7 @@ jobs: - name: Setup Java uses: actions/setup-java@v4 with: 
- java-version: '24' + java-version: '25' distribution: 'temurin' - name: Echo Java Version From 8ab3c35ca1b7139c7e6826c2cf5a6f89b09bc8f3 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Mon, 22 Sep 2025 22:04:25 -0700 Subject: [PATCH 12/26] Disable GHA Workflows until they support Java 25. --- .github/workflows/auto-jdk-matrix.yml | 12 ++++++------ .github/workflows/auto-os-matrix.yml | 12 ++++++------ .github/workflows/check_cpp_files.yml | 12 ++++++------ .github/workflows/codeql-analysis.yml | 12 ++++++------ .github/workflows/javadoc.yml | 4 ++-- 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/workflows/auto-jdk-matrix.yml b/.github/workflows/auto-jdk-matrix.yml index 6e71e53b4..11281bbdb 100644 --- a/.github/workflows/auto-jdk-matrix.yml +++ b/.github/workflows/auto-jdk-matrix.yml @@ -1,12 +1,12 @@ name: Auto JDK Matrix Test & Install on: - push: - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] - pull_request: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - # The branches below must be a subset of the branches above - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# push: +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# pull_request: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# The branches below must be a subset of the branches above +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: env: diff --git a/.github/workflows/auto-os-matrix.yml b/.github/workflows/auto-os-matrix.yml index 3ee7076b4..13caa578b 100644 --- a/.github/workflows/auto-os-matrix.yml +++ b/.github/workflows/auto-os-matrix.yml @@ -1,13 +1,13 @@ name: Auto OS Matrix Test & Install on: - push: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] - pull_request: - paths-ignore: [ '**/*.html', '**/*.md', 
'**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# push: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# pull_request: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] # The branches below must be a subset of the branches above - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: env: diff --git a/.github/workflows/check_cpp_files.yml b/.github/workflows/check_cpp_files.yml index 42e64a099..44aff2f83 100644 --- a/.github/workflows/check_cpp_files.yml +++ b/.github/workflows/check_cpp_files.yml @@ -1,13 +1,13 @@ name: CPP SerDe Compatibility Test on: - push: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] - pull_request: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# push: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# pull_request: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] # The branches below must be a subset of the branches above - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: jobs: diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index f476a2b7f..b6c723f7c 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -1,13 +1,13 @@ name: "CodeQL" on: - push: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', 
'**/NOTICE' ] - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] - pull_request: - paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# push: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# pull_request: +# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] # The branches below must be a subset of the branches above - branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] +# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: jobs: diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index b7e7289c6..1a98d91b8 100644 --- a/.github/workflows/javadoc.yml +++ b/.github/workflows/javadoc.yml @@ -1,8 +1,8 @@ name: JavaDoc on: - push: - branches: main +# push: +# branches: main workflow_dispatch: permissions: From 1401350061c8011f238019521b6ac70825139361 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 23 Sep 2025 15:29:27 -0700 Subject: [PATCH 13/26] Updated MemorySegmentRequestExample to use ConcurrentHashMap. 
--- .../common/MemorySegmentRequest.java | 6 +- .../common/MemorySegmentRequestExample.java | 73 +++++++++++++++++++ .../common/MemorySegmentRequestExtension.java | 70 ------------------ .../apache/datasketches/kll/KllHelper.java | 2 +- .../quantiles/DirectUpdateDoublesSketch.java | 2 +- .../kll/KllMemorySegmentRequestApp.java | 4 +- ...assicQuantilesMemorySegmentRequestApp.java | 4 +- 7 files changed, 81 insertions(+), 80 deletions(-) create mode 100644 src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java delete mode 100644 src/main/java/org/apache/datasketches/common/MemorySegmentRequestExtension.java diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentRequest.java b/src/main/java/org/apache/datasketches/common/MemorySegmentRequest.java index 5bf3253cd..2fe018335 100644 --- a/src/main/java/org/apache/datasketches/common/MemorySegmentRequest.java +++ b/src/main/java/org/apache/datasketches/common/MemorySegmentRequest.java @@ -29,15 +29,13 @@ public interface MemorySegmentRequest { /** - * Request a new MemorySegment with the given newByteSize. + * Request a new heap MemorySegment with the given newByteSize. * Because we do not have a reference to an Arena, the default here is to * allocate a new MemorySegment on the heap. It is up to the user to override this as appropriate. - * @param prevSeg the previous MemorySegment to be possibly closed here or by using the separate - * {@link #requestClose requestClose} method. This is included for convenience, it may be null. * @param newByteSize The new byteSize being requested. * @return new MemorySegment with the requested byteSize. 
*/ - default MemorySegment request(final MemorySegment prevSeg, final long newByteSize) { + default MemorySegment request(final long newByteSize) { if (newByteSize > Integer.MAX_VALUE) { throw new SketchesArgumentException("Requested size in bytes exceeds Integer.MAX_VALUE."); } diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java b/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java new file mode 100644 index 000000000..84912441a --- /dev/null +++ b/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.common; + +import java.lang.foreign.Arena; + +import java.lang.foreign.MemorySegment; +import java.util.Enumeration; +import java.util.concurrent.ConcurrentHashMap; + +/** + * This is an example of a possible implementation of the MemorySegmentRequest interface + * where all requested segments are allocated off-heap. A local ConcurrentHashMap tracks a newly created confined Arena + * for every new MemorySegment allocated off-heap. 
This allows individual segments to be freed + * immediately upon receiving the {@link #requestClose(MemorySegment) requestClose(MemorySegment)} call. + */ +public final class MemorySegmentRequestExample implements MemorySegmentRequest { + private final ConcurrentHashMap map = new ConcurrentHashMap<>(); + + /** + * Request a new off-heap MemorySegment with the given newByteSeze. + * An internal confined Arena is created to exclusively manage the new segment and it is associated + * with the new segment with a ConcurrentHashMap. + */ + @Override + public synchronized MemorySegment request(final long newByteSize) { + final Arena arena = Arena.ofConfined(); + final MemorySegment seg = arena.allocate(newByteSize); + map.put(seg, arena); + return seg; + + } + + @Override + public synchronized void requestClose(final MemorySegment segKey) { + final Arena arena = map.get(segKey); + if (arena == null) { throw new SketchesArgumentException("Given MemorySegment key is not mapped to an Arena!"); } + if (arena.scope().isAlive()) { + arena.close(); + map.remove(segKey); + } + } + + /** + * This cleans up any unclosed, off-heap MemorySegments. + */ + public synchronized void cleanup() { + for (final Enumeration e = map.elements(); e.hasMoreElements();) { + final Arena arena = e.nextElement(); + if (arena.scope().isAlive()) { + arena.close(); + } + } + } + +} diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExtension.java b/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExtension.java deleted file mode 100644 index d6d1c4371..000000000 --- a/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExtension.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.common; - -import java.lang.foreign.Arena; -import java.lang.foreign.MemorySegment; -import java.util.Enumeration; -import java.util.Hashtable; - -/** - * This is just an example of a possible extension of the MemorySegmentRequest interface. - * You may want to enable the println statements to track the state of the Hashtable. - */ -public final class MemorySegmentRequestExtension implements MemorySegmentRequest { - private final Hashtable table = new Hashtable<>(); - - @Override - public synchronized MemorySegment request(final MemorySegment prevSeg, final long newByteSize) { - if (prevSeg.isNative()) { - final Arena arena = Arena.ofConfined(); - final MemorySegment seg = arena.allocate(newByteSize); - table.put(seg, arena); //System.out.println("Add"); - return seg; - } else { - if (newByteSize > Integer.MAX_VALUE) { - throw new SketchesArgumentException("Requested byteSize is greater than Integer.MAX_VALUE."); - } - return MemorySegment.ofArray(new byte[(int)newByteSize]); - } - } - - @Override - public synchronized void requestClose(final MemorySegment prevSeg) { - final Arena arena = table.get(prevSeg); - if ((arena != null) && arena.scope().isAlive()) { - arena.close(); - table.remove(prevSeg); //System.out.println("Remove"); - } //else ignore - } - - /** - * This cleans up any unclosed off-heap MemorySegments. 
- */ - public synchronized void cleanup() { - for (final Enumeration e = table.elements(); e.hasMoreElements();) { - final Arena arena = e.nextElement(); - if (arena.scope().isAlive()) { - arena.close(); //System.out.println("Closed a remaining Arena in the Hashtable"); - } - } - } - -} diff --git a/src/main/java/org/apache/datasketches/kll/KllHelper.java b/src/main/java/org/apache/datasketches/kll/KllHelper.java index 3d784972f..73bfb5283 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllHelper.java @@ -357,7 +357,7 @@ static MemorySegment memorySegmentSpaceMgmt( if (mSegReq == null) { mSegReq = MemorySegmentRequest.DEFAULT; } - final MemorySegment newSeg = mSegReq.request(oldWseg, requiredSketchBytes); + final MemorySegment newSeg = mSegReq.request(requiredSketchBytes); MemorySegment.copy(oldWseg, 0, newSeg, 0, DATA_START_ADR); //copy preamble (first 20 bytes) mSegReq.requestClose(oldWseg); return newSeg; diff --git a/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java b/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java index 4976039d5..168b80b16 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/quantiles/DirectUpdateDoublesSketch.java @@ -358,7 +358,7 @@ private MemorySegment growCombinedSegBuffer(final int itemSpaceNeeded) { mSegReq_ = (mSegReq_ == null) ? 
MemorySegmentRequest.DEFAULT : mSegReq_; - final MemorySegment newSeg = mSegReq_.request(seg_, needBytes); + final MemorySegment newSeg = mSegReq_.request(needBytes); MemorySegment.copy(seg_, 0, newSeg, 0, segBytes); mSegReq_.requestClose(seg_); return newSeg; diff --git a/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java b/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java index 7d875a7e6..ad7b92d86 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java +++ b/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java @@ -27,7 +27,7 @@ import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; -import org.apache.datasketches.common.MemorySegmentRequestExtension; +import org.apache.datasketches.common.MemorySegmentRequestExample; import org.testng.annotations.Test; @@ -49,7 +49,7 @@ public void checkMemorySegmentRequestExtension() { final MemorySegment seg = arena.allocate(numBytes); //Use the custom extension of the MemorySegmentRequest interface. 
- final MemorySegmentRequestExtension mSegReqExt = new MemorySegmentRequestExtension(); + final MemorySegmentRequestExample mSegReqExt = new MemorySegmentRequestExample(); //Create a new KllLongsSketch and pass the custom extension final KllLongsSketch sk = KllLongsSketch.newDirectInstance(k, seg, mSegReqExt); diff --git a/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java b/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java index 7556d2be9..74777d0fe 100644 --- a/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java +++ b/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java @@ -25,7 +25,7 @@ import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; -import org.apache.datasketches.common.MemorySegmentRequestExtension; +import org.apache.datasketches.common.MemorySegmentRequestExample; import org.testng.annotations.Test; public class ClassicQuantilesMemorySegmentRequestApp { @@ -47,7 +47,7 @@ public void checkMemorySegmentRequestExtension() { final MemorySegment seg = arena.allocate(initalBytes); //Use the custom extension of the MemorySegmentRequest interface. - final MemorySegmentRequestExtension mSegReqExt = new MemorySegmentRequestExtension(); + final MemorySegmentRequestExample mSegReqExt = new MemorySegmentRequestExample(); //Create a new KllLongsSketch and pass the custom extension final DoublesSketchBuilder bldr = DoublesSketch.builder().setK(k); From eba1d8c9121c131eaba1eaf954b8662f7cb5ebac Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 25 Sep 2025 12:53:05 -0700 Subject: [PATCH 14/26] Refactor MemorySegmentRequest to make concurrent. 
--- .../common/MemorySegmentRequestExample.java | 23 ++-- .../ConcurrentDirectQuickSelectSketch.java | 13 +- .../theta/DirectQuickSelectSketch.java | 42 ++++-- .../datasketches/theta/PreambleUtil.java | 10 +- .../org/apache/datasketches/theta/Sketch.java | 2 +- .../apache/datasketches/theta/Sketches.java | 10 +- .../apache/datasketches/theta/UnionImpl.java | 12 +- .../datasketches/theta/UpdateSketch.java | 11 +- .../theta/UpdateSketchBuilder.java | 120 +++++++++++------- .../kll/KllMemorySegmentRequestApp.java | 33 ++--- ...assicQuantilesMemorySegmentRequestApp.java | 30 ++--- ...ConcurrentDirectQuickSelectSketchTest.java | 14 +- .../ConcurrentHeapQuickSelectSketchTest.java | 10 +- .../theta/DirectQuickSelectSketchTest.java | 26 ++-- .../datasketches/theta/DirectUnionTest.java | 6 +- .../datasketches/theta/UpdateSketchTest.java | 74 +++++------ 16 files changed, 239 insertions(+), 197 deletions(-) diff --git a/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java b/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java index 84912441a..650ce4223 100644 --- a/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java +++ b/src/main/java/org/apache/datasketches/common/MemorySegmentRequestExample.java @@ -24,11 +24,12 @@ import java.lang.foreign.MemorySegment; import java.util.Enumeration; import java.util.concurrent.ConcurrentHashMap; +import java.util.Objects; /** * This is an example of a possible implementation of the MemorySegmentRequest interface - * where all requested segments are allocated off-heap. A local ConcurrentHashMap tracks a newly created confined Arena - * for every new MemorySegment allocated off-heap. This allows individual segments to be freed + * where all requested segments are allocated off-heap. A local ConcurrentHashMap tracks a newly created + * confined Arena for every new MemorySegment allocated off-heap. 
This allows individual segments to be freed * immediately upon receiving the {@link #requestClose(MemorySegment) requestClose(MemorySegment)} call. */ public final class MemorySegmentRequestExample implements MemorySegmentRequest { @@ -50,24 +51,26 @@ public synchronized MemorySegment request(final long newByteSize) { @Override public synchronized void requestClose(final MemorySegment segKey) { + Objects.requireNonNull(segKey, "MemorySegment segKey must not be null"); final Arena arena = map.get(segKey); - if (arena == null) { throw new SketchesArgumentException("Given MemorySegment key is not mapped to an Arena!"); } - if (arena.scope().isAlive()) { - arena.close(); + if (arena != null) { + if (arena.scope().isAlive()) { arena.close(); } map.remove(segKey); + } else { + //ignore or + //throw new SketchesArgumentException("Given MemorySegment key is not mapped to an Arena!"); } } /** - * This cleans up any unclosed, off-heap MemorySegments. + * This closes any unclosed, off-heap MemorySegments and removes all mappings from the map. 
*/ public synchronized void cleanup() { - for (final Enumeration e = map.elements(); e.hasMoreElements();) { + for (final Enumeration e = map.elements(); e.hasMoreElements(); ) { final Arena arena = e.nextElement(); - if (arena.scope().isAlive()) { - arena.close(); - } + if (arena.scope().isAlive()) { arena.close(); } } + map.clear(); } } diff --git a/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java index 6e7cad3c5..b70fdda36 100644 --- a/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketch.java @@ -75,7 +75,9 @@ final class ConcurrentDirectQuickSelectSketch extends DirectQuickSelectSketch final double maxConcurrencyError, final MemorySegment dstSeg) { super(lgNomLongs, seed, 1.0F, //p ResizeFactor.X1, //rf, - dstSeg, false); //unionGadget + dstSeg, + null, + false); //unionGadget volatileThetaLong_ = Long.MAX_VALUE; volatileEstimate_ = 0; @@ -91,6 +93,7 @@ final class ConcurrentDirectQuickSelectSketch extends DirectQuickSelectSketch super(sketch.getLgNomLongs(), seed, 1.0F, //p ResizeFactor.X1, //rf, dstSeg, + null, false); //unionGadget exactLimit_ = ConcurrentSharedThetaSketch.computeExactLimit(1L << getLgNomLongs(), @@ -115,7 +118,7 @@ public double getEstimate() { @Override public boolean isEstimationMode() { - return (getRetainedEntries(false) > exactLimit_) || super.isEstimationMode(); + return getRetainedEntries(false) > exactLimit_ || super.isEstimationMode(); } @Override @@ -164,7 +167,7 @@ public long getExactLimit() { @Override public boolean startEagerPropagation() { while (!sharedPropagationInProgress_.compareAndSet(false, true)) { /* busy wait till free */ } - return (!isEstimationMode());// no eager propagation is allowed in estimation mode + return !isEstimationMode();// no eager propagation is allowed in estimation 
mode } @Override @@ -206,8 +209,8 @@ public void initBgPropagationService() { public boolean propagate(final AtomicBoolean localPropagationInProgress, final Sketch sketchIn, final long singleHash) { final long epoch = epoch_; - if ((singleHash != NOT_SINGLE_HASH) // namely, is a single hash and - && (getRetainedEntries(false) < exactLimit_)) { // a small sketch then propagate myself (blocking) + if (singleHash != NOT_SINGLE_HASH // namely, is a single hash and + && getRetainedEntries(false) < exactLimit_) { // a small sketch then propagate myself (blocking) if (!startEagerPropagation()) { endPropagation(localPropagationInProgress, true); return false; diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java index 7a0d16cc6..5980f7bbb 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java @@ -60,6 +60,7 @@ import java.lang.foreign.MemorySegment; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SuppressFBWarnings; @@ -81,15 +82,19 @@ class DirectQuickSelectSketch extends DirectQuickSelectSketchR { private static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space int hashTableThreshold_; //computed and mutable, kept only on heap, never serialized. + private final MemorySegmentRequest mSegReq; /** - * Construct this sketch as a result of a wrap operation where the given MemorySegment already has a sketch image. - * @param wseg the given MemorySegment that has a sketch image. + * Construct this sketch as a result of a wrap operation where the given MemorySegment already has an updatable sketch image. 
+ * @param wseg the given MemorySegment that has an updatable sketch image. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param seed See Update Hash Seed. */ private DirectQuickSelectSketch( final MemorySegment wseg, + final MemorySegmentRequest mSegReq, final long seed) { + this.mSegReq = mSegReq == null ? MemorySegmentRequest.DEFAULT : mSegReq; super(wseg, seed); } @@ -105,6 +110,7 @@ private DirectQuickSelectSketch( * See Resize Factor * @param dstSeg the given MemorySegment object destination. It cannot be null. * It will be cleared prior to use. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param unionGadget true if this sketch is implementing the Union gadget function. * Otherwise, it is behaving as a normal QuickSelectSketch. */ @@ -114,6 +120,7 @@ private DirectQuickSelectSketch( final float p, final ResizeFactor rf, final MemorySegment dstSeg, + final MemorySegmentRequest mSegReq, final boolean unionGadget) { //Choose family, preambleLongs @@ -129,7 +136,7 @@ private DirectQuickSelectSketch( final long curSegCapBytes = dstSeg.byteSize(); if (curSegCapBytes < minReqBytes) { throw new SketchesArgumentException( - "MemorySegment capacity is too small: " + curSegCapBytes + " < " + minReqBytes); + "MemorySegment capacity is less than minimum required: " + curSegCapBytes + " < " + minReqBytes); } //@formatter:off @@ -153,17 +160,22 @@ private DirectQuickSelectSketch( //clear hash table area dstSeg.asSlice(preambleLongs << 3, Long.BYTES << lgArrLongs).fill((byte)0); hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); + this.mSegReq = mSegReq == null ? MemorySegmentRequest.DEFAULT : mSegReq; super(dstSeg, seed); } /** - * Wrap a sketch around the given source MemorySegment containing sketch data that originated from - * this sketch. + * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. 
* @param srcSeg The given MemorySegment object must be in hash table form and not read only. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param seed See Update Hash Seed * @return instance of this sketch */ - static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final long seed) { + //called from UnionImpl and UpdateSketch + static DirectQuickSelectSketch writableWrap( + final MemorySegment srcSeg, + final MemorySegmentRequest mSegReq, + final long seed) { final int preambleLongs = extractPreLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 @@ -176,7 +188,7 @@ static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final lo insertLgResizeFactor(srcSeg, ResizeFactor.X2.lg()); } - final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, seed); + final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, mSegReq, seed); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -185,14 +197,19 @@ static DirectQuickSelectSketch writableWrap(final MemorySegment srcSeg, final lo * Fast-wrap a sketch around the given source MemorySegment containing sketch data that originated from * this sketch. This does NO validity checking of the given MemorySegment. * @param srcSeg The given MemorySegment must be in hash table form and not read only. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. 
* @param seed See Update Hash Seed * @return instance of this sketch */ - static DirectQuickSelectSketch fastWritableWrap(final MemorySegment srcSeg, final long seed) { + //called from UnionImpl <- Union + static DirectQuickSelectSketch fastWritableWrap( + final MemorySegment srcSeg, + final MemorySegmentRequest mSegReq, + final long seed) { final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, seed); + final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(srcSeg, mSegReq, seed); dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -205,7 +222,7 @@ static DirectQuickSelectSketch fastWritableWrap(final MemorySegment srcSeg, fina public UpdateSketch rebuild() { final int lgNomLongs = getLgNomLongs(); final int preambleLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - if (getRetainedEntries(true) > (1 << lgNomLongs)) { + if (getRetainedEntries(true) > 1 << lgNomLongs) { quickSelectAndRebuild(wseg_, preambleLongs, lgNomLongs); } return this; @@ -279,10 +296,13 @@ UpdateReturnState hashUpdate(final long hash) { tgtLgArrLongs = Math.min(lgArrLongs + lgRF, lgNomLongs + 1); final int tgtArrBytes = 8 << tgtLgArrLongs; final int reqBytes = tgtArrBytes + preBytes; - final MemorySegment newDstSeg = MemorySegment.ofArray(new byte[reqBytes]); //always on-heap //TODO ADD MemSegReq + + final MemorySegment newDstSeg = mSegReq.request(reqBytes); moveAndResize(wseg_, preambleLongs, lgArrLongs, newDstSeg, tgtLgArrLongs, thetaLong); + final MemorySegment oldSeg = wseg_; wseg_ = newDstSeg; + mSegReq.requestClose(oldSeg); hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, tgtLgArrLongs); return InsertedCountIncrementedResized; diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index 
296fc7d5e..294682e04 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -272,7 +272,7 @@ static String preambleToString(final MemorySegment seg) { //Flags final int flags = extractFlags(seg); - final String flagsStr = (flags) + ", 0x" + (Integer.toHexString(flags)) + ", " + final String flagsStr = flags + ", 0x" + Integer.toHexString(flags) + ", " + zeroPad(Integer.toBinaryString(flags), 8); final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; final boolean empty = (flags & EMPTY_FLAG_MASK) > 0; @@ -377,11 +377,11 @@ else if (preLongs == 3) { //@formatter:on static int extractPreLongs(final MemorySegment seg) { - return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; + return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //for SerVer 1,2,3 } static int extractLgResizeFactor(final MemorySegment seg) { - return (seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT) & 0X3; + return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT & 0X3; } static int extractLgResizeRatioV1(final MemorySegment seg) { @@ -463,7 +463,7 @@ static void insertLgResizeFactor(final MemorySegment seg, final int rf) { final int curByte = seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0xFF; final int shift = LG_RESIZE_FACTOR_BIT; // shift in bits final int mask = 3; - final byte newByte = (byte) (((rf & mask) << shift) | (~(mask << shift) & curByte)); + final byte newByte = (byte) ((rf & mask) << shift | ~(mask << shift) & curByte); seg.set(JAVA_BYTE, PREAMBLE_LONGS_BYTE, newByte); } @@ -520,7 +520,7 @@ static void clearEmpty(final MemorySegment seg) { } static boolean isEmptyFlag(final MemorySegment seg) { - return ((extractFlags(seg) & EMPTY_FLAG_MASK) > 0); + return (extractFlags(seg) & EMPTY_FLAG_MASK) > 0; } /** diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 5c14b8fda..8ff9c13d0 
100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -340,7 +340,7 @@ public static int getMaxUpdateSketchBytes(final int nomEntries) { * @return the maximum number of storage bytes required for a UpdateSketch with the given lgNomEntries */ public static int getUpdateSketchMaxBytes(final int lgNomEntries) { - return (1 << lgNomEntries << 4) + (Family.QUICKSELECT.getMaxPreLongs() << 3); + return (16 << lgNomEntries) + (Family.QUICKSELECT.getMaxPreLongs() << 3); } /** diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java index 40c7ccf86..69d945c40 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketches.java +++ b/src/main/java/org/apache/datasketches/theta/Sketches.java @@ -355,12 +355,12 @@ public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg) { * @return {@link UpdateSketch UpdateSketch} */ public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.wrap(srcSeg, expectedSeed); + return UpdateSketch.wrap(srcSeg, null, expectedSeed); } //Restricted static methods - static void checkIfValidThetaSketch(final MemorySegment srcSeg) { + private static void checkIfValidThetaSketch(final MemorySegment srcSeg) { final int fam = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); if (!Sketch.isValidSketchID(fam)) { throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. 
Family: " @@ -371,7 +371,7 @@ static void checkIfValidThetaSketch(final MemorySegment srcSeg) { static boolean getEmpty(final MemorySegment srcSeg) { final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); if (serVer == 1) { - return ((getThetaLong(srcSeg) == Long.MAX_VALUE) && (getRetainedEntries(srcSeg) == 0)); + return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; } return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 } @@ -384,7 +384,7 @@ static int getRetainedEntries(final MemorySegment srcSeg) { final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); if (serVer == 1) { final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); - if ((getThetaLong(srcSeg) == Long.MAX_VALUE) && (entries == 0)) { + if (getThetaLong(srcSeg) == Long.MAX_VALUE && entries == 0) { return 0; } return entries; @@ -401,6 +401,6 @@ static int getRetainedEntries(final MemorySegment srcSeg) { static long getThetaLong(final MemorySegment srcSeg) { final int preLongs = getPreambleLongs(srcSeg); - return (preLongs < 3) ? Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3 + return preLongs < 3 ? 
Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3 } } diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java index ed0178c8c..d921ec1ba 100644 --- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java @@ -105,7 +105,7 @@ static UnionImpl initNewDirectInstance( final ResizeFactor rf, final MemorySegment dstSeg) { final UpdateSketch gadget = //create with UNION family - new DirectQuickSelectSketch(lgNomLongs, seed, p, rf, dstSeg, true); + new DirectQuickSelectSketch(lgNomLongs, seed, p, rf, dstSeg, null, true); final UnionImpl unionImpl = new UnionImpl(gadget, seed); unionImpl.unionThetaLong_ = gadget.getThetaLong(); unionImpl.unionEmpty_ = gadget.isEmpty(); @@ -142,7 +142,7 @@ static UnionImpl fastWrapInstance(final MemorySegment srcSeg, final long expecte Family.UNION.checkFamilyID(extractFamilyID(srcSeg)); final UpdateSketch gadget = srcSeg.isReadOnly() ? DirectQuickSelectSketchR.fastReadOnlyWrap(srcSeg, expectedSeed) - : DirectQuickSelectSketch.fastWritableWrap(srcSeg, expectedSeed); + : DirectQuickSelectSketch.fastWritableWrap(srcSeg, null, expectedSeed); final UnionImpl unionImpl = new UnionImpl(gadget, expectedSeed); unionImpl.unionThetaLong_ = extractUnionThetaLong(srcSeg); unionImpl.unionEmpty_ = PreambleUtil.isEmptyFlag(srcSeg); @@ -151,17 +151,17 @@ static UnionImpl fastWrapInstance(final MemorySegment srcSeg, final long expecte /** * Wrap a Union object around a Union MemorySegment object containing data. - * Called by SetOperation. * @param srcSeg The source MemorySegment object. * @param expectedSeed the seed used to validate the given MemorySegment image. 
* See seed * @return this class */ + //Called by SetOperation and Union static UnionImpl wrapInstance(final MemorySegment srcSeg, final long expectedSeed) { Family.UNION.checkFamilyID(extractFamilyID(srcSeg)); final UpdateSketch gadget = srcSeg.isReadOnly() ? DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed) - : DirectQuickSelectSketch.writableWrap(srcSeg, expectedSeed); + : DirectQuickSelectSketch.writableWrap(srcSeg, null, expectedSeed); final UnionImpl unionImpl = new UnionImpl(gadget, expectedSeed); unionImpl.unionThetaLong_ = extractUnionThetaLong(srcSeg); unionImpl.unionEmpty_ = PreambleUtil.isEmptyFlag(srcSeg); @@ -269,7 +269,7 @@ public CompactSketch union(final Sketch sketchA, final Sketch sketchB, final boo public void union(final Sketch sketchIn) { //UNION Empty Rule: AND the empty states. - if ((sketchIn == null) || sketchIn.isEmpty()) { + if (sketchIn == null || sketchIn.isEmpty()) { //null and empty is interpreted as (Theta = 1.0, count = 0, empty = T). Nothing changes return; } @@ -287,7 +287,7 @@ public void union(final Sketch sketchIn) { final HashIterator it = sketchIn.iterator(); while (it.next()) { final long hash = it.get(); - if ((hash < unionThetaLong_) && (hash < gadget_.getThetaLong())) { + if (hash < unionThetaLong_ && hash < gadget_.getThetaLong()) { gadget_.hashUpdate(hash); // backdoor update, hash function is bypassed } else if (isOrdered) { break; } } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index f080e976a..8fe93e46c 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -47,6 +47,7 @@ import java.util.Objects; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; import 
org.apache.datasketches.common.Util; @@ -80,7 +81,7 @@ public abstract class UpdateSketch extends Sketch { * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch wrap(final MemorySegment srcWSeg) { - return wrap(srcWSeg, Util.DEFAULT_UPDATE_SEED); + return wrap(srcWSeg, null, Util.DEFAULT_UPDATE_SEED); } /** @@ -91,6 +92,7 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg) { * Java Heap version of the sketch where all data will be copied to the heap. * @param srcWSeg an image of a writable sketch where the image seed hash matches the given seed hash. * It must have a size of at least 24 bytes. + * @param mSegReq an implementation of the MemorySegmentRequest interface or null. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. * Compact sketches store a 16-bit hash of the seed, but not the seed itself. @@ -99,7 +101,10 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg) { * is invalid, corrupted, or incompatible with this sketch type. * Callers must treat this as a fatal error for that segment. 
*/ - public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expectedSeed) { + public static UpdateSketch wrap( + final MemorySegment srcWSeg, + final MemorySegmentRequest mSegReq, + final long expectedSeed) { Objects.requireNonNull(srcWSeg, "Source MemorySegment must not be null"); checkBounds(0, 24, srcWSeg.byteSize()); //need min 24 bytes final int preLongs = srcWSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits @@ -111,7 +116,7 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expected "A " + family + " sketch cannot be wrapped as an UpdateSketch."); } if (serVer == 3 && preLongs == 3) { - return DirectQuickSelectSketch.writableWrap(srcWSeg, expectedSeed); + return DirectQuickSelectSketch.writableWrap(srcWSeg, mSegReq, expectedSeed); } else { throw new SketchesArgumentException( "Corrupted: An UpdateSketch image must have SerVer = 3 and preLongs = 3"); diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java index 6c8d5f37f..d91d654b6 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java @@ -26,6 +26,7 @@ import java.lang.foreign.MemorySegment; import org.apache.datasketches.common.Family; +import org.apache.datasketches.common.MemorySegmentRequest; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; @@ -44,10 +45,11 @@ public final class UpdateSketchBuilder { private ResizeFactor bRF; private Family bFam; private float bP; + private MemorySegmentRequest bMemorySegmentRequest; //Fields for concurrent theta sketch private int bNumPoolThreads; - private int bLocalLgNomLongs; + private int bConCurLgNomLongs; private boolean bPropagateOrderedCompact; private double bMaxConcurrencyError; 
private int bMaxNumLocalThreads; @@ -57,11 +59,12 @@ public final class UpdateSketchBuilder { *
    *
  • Nominal Entries: {@value org.apache.datasketches.thetacommon.ThetaUtil#DEFAULT_NOMINAL_ENTRIES}
  • *
  • Seed: {@value org.apache.datasketches.common.Util#DEFAULT_UPDATE_SEED}
  • - *
  • Input Sampling Probability: 1.0
  • - *
  • Family: {@link org.apache.datasketches.common.Family#QUICKSELECT}
  • *
  • Resize Factor: The default for sketches on the Java heap is {@link ResizeFactor#X8}. * For direct sketches, which are targeted for off-heap, this value will * be fixed at either {@link ResizeFactor#X1} or {@link ResizeFactor#X2}.
  • + *
  • Family: {@link org.apache.datasketches.common.Family#QUICKSELECT}
  • + *
  • Input Sampling Probability, p: 1.0
  • + *
  • MemorySegmentRequest implementation: null
  • *
* Parameters unique to the concurrent sketches only: *
    @@ -75,19 +78,21 @@ public final class UpdateSketchBuilder { public UpdateSketchBuilder() { bLgNomLongs = Integer.numberOfTrailingZeros(ThetaUtil.DEFAULT_NOMINAL_ENTRIES); bSeed = Util.DEFAULT_UPDATE_SEED; - bP = (float) 1.0; bRF = ResizeFactor.X8; bFam = Family.QUICKSELECT; + bP = (float) 1.0; + bMemorySegmentRequest = null; + // Default values for concurrent sketch bNumPoolThreads = ConcurrentPropagationService.NUM_POOL_THREADS; - bLocalLgNomLongs = 4; //default is smallest legal QS sketch + bConCurLgNomLongs = 4; //default is smallest legal QS sketch bPropagateOrderedCompact = true; bMaxConcurrencyError = 0; bMaxNumLocalThreads = 1; } /** - * Sets the Nominal Entries for this sketch. + * Sets the local Nominal Entries for this builder. * This value is also used for building a shared concurrent sketch. * The minimum value is 16 (2^4) and the maximum value is 67,108,864 (2^26). * Be aware that sketches as large as this maximum value may not have been @@ -103,7 +108,7 @@ public UpdateSketchBuilder setNominalEntries(final int nomEntries) { } /** - * Alternative method of setting the Nominal Entries for this sketch from the log_base2 value. + * Alternative method of setting the local Nominal Entries for this builder from the log_base2 value. * This value is also used for building a shared concurrent sketch. * The minimum value is 4 and the maximum value is 26. * Be aware that sketches as large as this maximum value may not have been @@ -118,14 +123,13 @@ public UpdateSketchBuilder setLogNominalEntries(final int lgNomEntries) { } /** - * Alternative method of setting the Nominal Entries for this sketch from the log_base2 value, - * commonly called LgK. + * Alternative method of setting the Nominal Entries for this builder from the log_base2 value, commonly called LgK. * This value is also used for building a shared concurrent sketch. * The minimum value is 4 and the maximum value is 26. 
* Be aware that sketches as large as 26 may not have been * thoroughly characterized for performance. * - * @param lgK the Log Nominal Entries. Also for the concurrent shared sketch + * @param lgK the Log Nominal Entries. Also for the concurrent shared sketch. * @return this UpdateSketchBuilder */ public UpdateSketchBuilder setLgK(final int lgK) { @@ -134,7 +138,7 @@ public UpdateSketchBuilder setLgK(final int lgK) { } /** - * Returns Log-base 2 Nominal Entries + * Returns the local Log-base 2 Nominal Entries * @return Log-base 2 Nominal Entries */ public int getLgNominalEntries() { @@ -142,7 +146,7 @@ public int getLgNominalEntries() { } /** - * Sets the Nominal Entries for the concurrent local sketch. The minimum value is 16 and the + * Sets the local (default) Concurrent Nominal Entries for the concurrent local sketch. The minimum value is 16 and the * maximum value is 67,108,864, which is 2^26. * Be aware that sketches as large as this maximum * value have not been thoroughly tested or characterized for performance. @@ -151,9 +155,9 @@ public int getLgNominalEntries() { * This will become the ceiling power of 2 if it is not. 
* @return this UpdateSketchBuilder */ - public UpdateSketchBuilder setLocalNominalEntries(final int nomEntries) { - bLocalLgNomLongs = Integer.numberOfTrailingZeros(ceilingPowerOf2(nomEntries)); - if ((bLocalLgNomLongs > ThetaUtil.MAX_LG_NOM_LONGS) || (bLocalLgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS)) { + public UpdateSketchBuilder setConCurNominalEntries(final int nomEntries) { + bConCurLgNomLongs = Integer.numberOfTrailingZeros(ceilingPowerOf2(nomEntries)); + if (bConCurLgNomLongs > ThetaUtil.MAX_LG_NOM_LONGS || bConCurLgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS) { throw new SketchesArgumentException( "Nominal Entries must be >= 16 and <= 67108864: " + nomEntries); } @@ -161,8 +165,7 @@ public UpdateSketchBuilder setLocalNominalEntries(final int nomEntries) { } /** - * Alternative method of setting the Nominal Entries for a local concurrent sketch from the - * log_base2 value. + * Alternative method of setting the local (default) Nominal Entries for a local concurrent sketch from the log_base2 value. * The minimum value is 4 and the maximum value is 26. * Be aware that sketches as large as this maximum * value have not been thoroughly tested or characterized for performance. 
@@ -170,9 +173,9 @@ public UpdateSketchBuilder setLocalNominalEntries(final int nomEntries) { * @param lgNomEntries the Log Nominal Entries for a concurrent local sketch * @return this UpdateSketchBuilder */ - public UpdateSketchBuilder setLocalLogNominalEntries(final int lgNomEntries) { - bLocalLgNomLongs = lgNomEntries; - if ((bLocalLgNomLongs > ThetaUtil.MAX_LG_NOM_LONGS) || (bLocalLgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS)) { + public UpdateSketchBuilder setConCurLogNominalEntries(final int lgNomEntries) { + bConCurLgNomLongs = lgNomEntries; + if (bConCurLgNomLongs > ThetaUtil.MAX_LG_NOM_LONGS || bConCurLgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS) { throw new SketchesArgumentException( "Log Nominal Entries must be >= 4 and <= 26: " + lgNomEntries); } @@ -180,15 +183,15 @@ public UpdateSketchBuilder setLocalLogNominalEntries(final int lgNomEntries) { } /** - * Returns Log-base 2 Nominal Entries for the concurrent local sketch + * Returns local Log-base 2 Nominal Entries for the concurrent local sketch * @return Log-base 2 Nominal Entries for the concurrent local sketch */ - public int getLocalLgNominalEntries() { - return bLocalLgNomLongs; + public int getConCurLgNominalEntries() { + return bConCurLgNomLongs; } /** - * Sets the long seed value that is required by the hashing function. + * Sets the local long seed value that is required by the hashing function. * @param seed See seed * @return this UpdateSketchBuilder */ @@ -198,7 +201,7 @@ public UpdateSketchBuilder setSeed(final long seed) { } /** - * Returns the seed + * Returns the local long seed value that is required by the hashing function. 
* @return the seed */ public long getSeed() { @@ -206,12 +209,12 @@ public long getSeed() { } /** - * Sets the upfront uniform sampling probability, p + * Sets the local upfront uniform pre-sampling probability, p * @param p See Sampling Probability, p * @return this UpdateSketchBuilder */ public UpdateSketchBuilder setP(final float p) { - if ((p <= 0.0) || (p > 1.0)) { + if (p <= 0.0 || p > 1.0) { throw new SketchesArgumentException("p must be > 0 and <= 1.0: " + p); } bP = p; @@ -219,7 +222,7 @@ public UpdateSketchBuilder setP(final float p) { } /** - * Returns the pre-sampling probability p + * Returns the local upfront uniform pre-sampling probability p * @return the pre-sampling probability p */ public float getP() { @@ -227,7 +230,7 @@ public float getP() { } /** - * Sets the cache Resize Factor. + * Sets the local cache Resize Factor. * @param rf See Resize Factor * @return this UpdateSketchBuilder */ @@ -237,7 +240,7 @@ public UpdateSketchBuilder setResizeFactor(final ResizeFactor rf) { } /** - * Returns the Resize Factor + * Returns the local Resize Factor * @return the Resize Factor */ public ResizeFactor getResizeFactor() { @@ -245,7 +248,7 @@ public ResizeFactor getResizeFactor() { } /** - * Set the Family. + * Set the local Family. Choose either Family.ALPHA or Family.QUICKSELECT. * @param family the family for this builder * @return this UpdateSketchBuilder */ @@ -255,7 +258,7 @@ public UpdateSketchBuilder setFamily(final Family family) { } /** - * Returns the Family + * Returns the local Family * @return the Family */ public Family getFamily() { @@ -263,7 +266,27 @@ public Family getFamily() { } /** - * Sets the number of pool threads used for background propagation in the concurrent sketches. 
+ * Sets the local MemorySegmentRequest + * @param mSegReq the given MemorySegmentRequest + * @return this UpdateSketchBuilder + */ + public UpdateSketchBuilder setMemorySegmentRequest(final MemorySegmentRequest mSegReq) { + bMemorySegmentRequest = mSegReq; + return this; + } + + /** + * Returns the local MemorySegmentRequest + * @return the local MemorySegmentRequest + */ + public MemorySegmentRequest getMemorySegmentRequest() { + return bMemorySegmentRequest; + } + + //Concurrent related + + /** + * Sets the local number of pool threads used for background propagation in the concurrent sketches. * @param numPoolThreads the given number of pool threads */ public void setNumPoolThreads(final int numPoolThreads) { @@ -271,7 +294,7 @@ public void setNumPoolThreads(final int numPoolThreads) { } /** - * Gets the number of background pool threads used for propagation in the concurrent sketches. + * Gets the local number of background pool threads used for propagation in the concurrent sketches. * @return the number of background pool threads */ public int getNumPoolThreads() { @@ -279,7 +302,7 @@ public int getNumPoolThreads() { } /** - * Sets the Propagate Ordered Compact flag to the given value. Used with concurrent sketches. + * Sets the local Propagate Ordered Compact flag to the given value. Used with concurrent sketches. * * @param prop the given value * @return this UpdateSketchBuilder @@ -290,7 +313,7 @@ public UpdateSketchBuilder setPropagateOrderedCompact(final boolean prop) { } /** - * Gets the Propagate Ordered Compact flag used with concurrent sketches. + * Gets the local Propagate Ordered Compact flag used with concurrent sketches. * @return the Propagate Ordered Compact flag */ public boolean getPropagateOrderedCompact() { @@ -298,7 +321,7 @@ public boolean getPropagateOrderedCompact() { } /** - * Sets the Maximum Concurrency Error. + * Sets the local Maximum Concurrency Error. * @param maxConcurrencyError the given Maximum Concurrency Error. 
*/ public void setMaxConcurrencyError(final double maxConcurrencyError) { @@ -306,7 +329,7 @@ public void setMaxConcurrencyError(final double maxConcurrencyError) { } /** - * Gets the Maximum Concurrency Error + * Gets the local Maximum Concurrency Error * @return the Maximum Concurrency Error */ public double getMaxConcurrencyError() { @@ -314,7 +337,7 @@ public double getMaxConcurrencyError() { } /** - * Sets the Maximum Number of Local Threads. + * Sets the local Maximum Number of Local Threads. * This is used to set the size of the local concurrent buffers. * @param maxNumLocalThreads the given Maximum Number of Local Threads */ @@ -323,7 +346,7 @@ public void setMaxNumLocalThreads(final int maxNumLocalThreads) { } /** - * Gets the Maximum Number of Local Threads. + * Gets the local Maximum Number of Local Threads. * @return the Maximum Number of Local Threads. */ public int getMaxNumLocalThreads() { @@ -343,12 +366,14 @@ public UpdateSketch build() { /** * Returns an UpdateSketch with the current configuration of this Builder * with the specified backing destination MemorySegment store. - * Note: this cannot be used with the Alpha Family of sketches. + * Note: this can only be used with the QUICKSELECT Family of sketches + * and cannot be used with the Alpha Family of sketches. * @param dstSeg The destination MemorySegment. 
* @return an UpdateSketch */ public UpdateSketch build(final MemorySegment dstSeg) { UpdateSketch sketch = null; + final boolean unionGadget = false; switch (bFam) { case ALPHA: { if (dstSeg == null) { @@ -361,11 +386,10 @@ public UpdateSketch build(final MemorySegment dstSeg) { } case QUICKSELECT: { if (dstSeg == null) { - sketch = new HeapQuickSelectSketch(bLgNomLongs, bSeed, bP, bRF, false); + sketch = new HeapQuickSelectSketch(bLgNomLongs, bSeed, bP, bRF, unionGadget); } else { - sketch = new DirectQuickSelectSketch( - bLgNomLongs, bSeed, bP, bRF, dstSeg, false); + sketch = new DirectQuickSelectSketch(bLgNomLongs, bSeed, bP, bRF, dstSeg, bMemorySegmentRequest, unionGadget); } break; } @@ -480,10 +504,10 @@ public UpdateSketch buildSharedFromSketch(final UpdateSketch sketch, final Memor * @return an UpdateSketch to be used as a per-thread local buffer. */ public UpdateSketch buildLocal(final UpdateSketch shared) { - if ((shared == null) || !(shared instanceof ConcurrentSharedThetaSketch)) { + if (shared == null || !(shared instanceof ConcurrentSharedThetaSketch)) { throw new SketchesStateException("The concurrent shared sketch must be built first."); } - return new ConcurrentHeapThetaBuffer(bLocalLgNomLongs, bSeed, + return new ConcurrentHeapThetaBuffer(bConCurLgNomLongs, bSeed, (ConcurrentSharedThetaSketch) shared, bPropagateOrderedCompact, bMaxNumLocalThreads); } @@ -493,8 +517,8 @@ public String toString() { sb.append("UpdateSketchBuilder configuration:").append(LS); sb.append("LgK:").append(TAB).append(bLgNomLongs).append(LS); sb.append("K:").append(TAB).append(1 << bLgNomLongs).append(LS); - sb.append("LgLocalK:").append(TAB).append(bLocalLgNomLongs).append(LS); - sb.append("LocalK:").append(TAB).append(1 << bLocalLgNomLongs).append(LS); + sb.append("LgLocalK:").append(TAB).append(bConCurLgNomLongs).append(LS); + sb.append("LocalK:").append(TAB).append(1 << bConCurLgNomLongs).append(LS); sb.append("Seed:").append(TAB).append(bSeed).append(LS); 
sb.append("p:").append(TAB).append(bP).append(LS); sb.append("ResizeFactor:").append(TAB).append(bRF).append(LS); diff --git a/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java b/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java index ad7b92d86..f0ed246ae 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java +++ b/src/test/java/org/apache/datasketches/kll/KllMemorySegmentRequestApp.java @@ -19,10 +19,8 @@ package org.apache.datasketches.kll; -import static org.apache.datasketches.kll.KllSketch.getMaxSerializedSizeBytes; -import static org.apache.datasketches.kll.KllSketch.SketchType.LONGS_SKETCH; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; @@ -30,7 +28,6 @@ import org.apache.datasketches.common.MemorySegmentRequestExample; import org.testng.annotations.Test; - public class KllMemorySegmentRequestApp { @Test @@ -39,23 +36,24 @@ public class KllMemorySegmentRequestApp { * This demonstrates one example of how to manage a growing off-heap KLL sketch where the * expanded MemorySegments are also off-heap. */ - public void checkMemorySegmentRequestExtension() { + public void checkMemorySegmentRequestExample() { final int k = 200; + final int itemsIn = 10 * k; //will force requests for more space + + //Use the custom MemorySegmentRequestExample to do the allocations. + final MemorySegmentRequestExample mSegReqEx = new MemorySegmentRequestExample(); //The allocation of the original off-heap MemorySegment for the KllLongsSketch //Note that this targets the size to only handle k values, which is quite small. 
- final int numBytes = getMaxSerializedSizeBytes(k, k, LONGS_SKETCH, true); - final Arena arena = Arena.ofConfined(); - final MemorySegment seg = arena.allocate(numBytes); + final int numBytes = KllSketch.getMaxSerializedSizeBytes(k, k, KllSketch.SketchType.LONGS_SKETCH, true); - //Use the custom extension of the MemorySegmentRequest interface. - final MemorySegmentRequestExample mSegReqExt = new MemorySegmentRequestExample(); + final MemorySegment seg = mSegReqEx.request(numBytes); - //Create a new KllLongsSketch and pass the custom extension - final KllLongsSketch sk = KllLongsSketch.newDirectInstance(k, seg, mSegReqExt); + //Create a new KllLongsSketch and pass the mSegReqEx + final KllLongsSketch sk = KllLongsSketch.newDirectInstance(k, seg, mSegReqEx); //Update the sketch with way more data than the original MemorySegment can handle, forcing it to request larger MemorySegments. - for (int n = 1; n <= (10 * k); n++) { sk.update(n); } + for (int n = 1; n <= itemsIn; n++) { sk.update(n); } //Check to make sure the sketch got all the data: assertEquals(sk.getMaxItem(), 10 * k); @@ -63,13 +61,10 @@ public void checkMemorySegmentRequestExtension() { assertEquals(sk.getN(), 10 * k); //Confirm that the last MemorySegment used by the sketch is, in fact, not the same as the original one that was allocated. - assertFalse(sk.getMemorySegment().equals(seg)); - - //All done with the sketch. Cleanup any unclosed off-heap MemorySegments. - mSegReqExt.cleanup(); + assertTrue(sk.getMemorySegment() != seg); - //Close the original off-heap allocated MemorySegment. - arena.close(); + //All done with the sketch. Cleanup any unclosed off-heap MemorySegments including the original allocation. 
+ mSegReqEx.cleanup(); } } diff --git a/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java b/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java index 74777d0fe..cfd4f61e1 100644 --- a/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java +++ b/src/test/java/org/apache/datasketches/quantiles/ClassicQuantilesMemorySegmentRequestApp.java @@ -20,7 +20,7 @@ package org.apache.datasketches.quantiles; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; @@ -36,27 +36,24 @@ public class ClassicQuantilesMemorySegmentRequestApp { * This demonstrates one example of how to manage a growing off-heap DoublesSketch where the * expanded MemorySegments are also off-heap. */ - public void checkMemorySegmentRequestExtension() { + public void checkMemorySegmentRequestExample() { final int k = 128; //The default is 128 final int itemsIn = 40 * k; //will force requests for more space + //Use the custom MemorySegmentRequestExample to do the allocations. + final MemorySegmentRequestExample mSegReqEx = new MemorySegmentRequestExample(); + //The allocation of the original off-heap MemorySegment for the DoublesSketch //Note that this targets the size to only handle 2k values, which is quite small. final int initalBytes = DoublesSketch.getUpdatableStorageBytes(k, 2 * k); - final Arena arena = Arena.ofConfined(); - final MemorySegment seg = arena.allocate(initalBytes); - //Use the custom extension of the MemorySegmentRequest interface. 
- final MemorySegmentRequestExample mSegReqExt = new MemorySegmentRequestExample(); + final MemorySegment seg = mSegReqEx.request(initalBytes); - //Create a new KllLongsSketch and pass the custom extension - final DoublesSketchBuilder bldr = DoublesSketch.builder().setK(k); - final DoublesSketch sk = bldr.build(seg, mSegReqExt); + //Create a new KllLongsSketch and pass the mSegReqEx + final DoublesSketch sk = DoublesSketch.builder().setK(k).build(seg, mSegReqEx); //Update the sketch with way more data than the original MemorySegment can handle, forcing it to request larger MemorySegments. - for (int n = 1; n <= itemsIn; n++) { - sk.update(n); - } + for (int n = 1; n <= itemsIn; n++) { sk.update(n); } //Check to make sure the sketch got all the data: assertEquals(sk.getMaxItem(), itemsIn); @@ -64,15 +61,10 @@ public void checkMemorySegmentRequestExtension() { assertEquals(sk.getN(), itemsIn); //Confirm that the last MemorySegment used by the sketch is, in fact, not the same as the original one that was allocated. - assertFalse(sk.getMemorySegment().equals(seg)); + assertTrue(sk.getMemorySegment() != seg); //All done with the sketch. Cleanup any unclosed off-heap MemorySegments. - mSegReqExt.cleanup(); - - //Close the original off-heap allocated MemorySegment. 
- arena.close(); + mSegReqEx.cleanup(); } - static void println(final Object o) { System.out.println(o.toString()); } - } diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java index 4a59edb97..28f78ac18 100644 --- a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java @@ -242,7 +242,7 @@ public void checkDQStoCompactForms() { assertEquals(csk.getClass().getSimpleName(), "HeapCompactSketch"); final int bytes = shared.getCompactBytes(); - assertEquals(bytes, (k*8) + (Family.COMPACT.getMaxPreLongs() << 3)); + assertEquals(bytes, k*8 + (Family.COMPACT.getMaxPreLongs() << 3)); final byte[] segArr2 = new byte[bytes]; final MemorySegment seg2 = MemorySegment.ofArray(segArr2); @@ -462,7 +462,7 @@ public void checkEstModeMemorySegmentArr() { waitForBgPropagationToComplete(shared); final double est = local.getEstimate(); - assertTrue((est < (u * 1.05)) && (est > (u * 0.95))); + assertTrue(est < u * 1.05 && est > u * 0.95); assertTrue(shared.getRetainedEntries(false) >= k); } @@ -480,7 +480,7 @@ public void checkEstModeNativeMemorySegment() { for (int i = 0; i< u; i++) { local.update(i); } waitForBgPropagationToComplete(shared); final double est = local.getEstimate(); - assertTrue((est < (u * 1.05)) && (est > (u * 0.95))); + assertTrue(est < u * 1.05 && est > u * 0.95); assertTrue(shared.getRetainedEntries(false) >= k); } @@ -501,7 +501,7 @@ public void checkConstructReconstructFromMemorySegment() { final double est1 = local.getEstimate(); final int count1 = shared.getRetainedEntries(false); - assertTrue((est1 < (u * 1.05)) && (est1 > (u * 0.95))); + assertTrue(est1 < u * 1.05 && est1 > u * 0.95); assertTrue(count1 >= k); byte[] serArr; @@ -576,7 +576,7 @@ public void checkBadLgNomLongs() { final boolean useSeg = true; 
final SharedLocal sl = new SharedLocal(lgK, lgK, useSeg); sl.wseg.set(JAVA_BYTE, LG_NOM_LONGS_BYTE, (byte) 3); //Corrupt LgNomLongs byte - DirectQuickSelectSketch.writableWrap(sl.wseg, Util.DEFAULT_UPDATE_SEED); + DirectQuickSelectSketch.writableWrap(sl.wseg, null, Util.DEFAULT_UPDATE_SEED); } @Test @@ -607,7 +607,7 @@ public void checkBackgroundPropagation() { final long theta2 = ((ConcurrentSharedThetaSketch)shared).getVolatileTheta(); final int entries = shared.getRetainedEntries(false); - assertTrue((entries > k) || (theta2 < theta1), + assertTrue(entries > k || theta2 < theta1, "entries="+entries+" k="+k+" theta1="+theta1+" theta2="+theta2); shared.rebuild(); @@ -658,7 +658,7 @@ public void checkWrapIllegalFamilyID_direct() { sl.wseg.set(JAVA_BYTE, FAMILY_BYTE, (byte) 0); //corrupt the Sketch ID byte //try to wrap the corrupted seg - DirectQuickSelectSketch.writableWrap(sl.wseg, Util.DEFAULT_UPDATE_SEED); + DirectQuickSelectSketch.writableWrap(sl.wseg, null, Util.DEFAULT_UPDATE_SEED); } @Test(expectedExceptions = SketchesArgumentException.class) diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java index 565ef50ed..7ba11c1c9 100644 --- a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java @@ -90,7 +90,7 @@ public void checkPropagationNotOrdered() { final SharedLocal sl = new SharedLocal(lgK, 4, false, false); final UpdateSketch shared = sl.shared; final UpdateSketch local = sl.local; - assertEquals((sl.bldr.getLocalLgNominalEntries()), 4); + assertEquals((sl.bldr.getConCurLgNominalEntries()), 4); assertTrue(local.isEmpty()); for (int i = 0; i < u; i++) { @@ -494,7 +494,7 @@ public void checkRebuild() { public void checkBuilder() { final int lgK = 4; final SharedLocal sl = new SharedLocal(lgK); - 
assertEquals(sl.bldr.getLocalLgNominalEntries(), lgK); + assertEquals(sl.bldr.getConCurLgNominalEntries(), lgK); assertEquals(sl.bldr.getLgNominalEntries(), lgK); println(sl.bldr.toString()); } @@ -652,11 +652,11 @@ public void checkBuilderExceptions() { fail(); } catch (final SketchesArgumentException e) { } try { - bldr.setLocalNominalEntries(8); + bldr.setConCurNominalEntries(8); fail(); } catch (final SketchesArgumentException e) { } try { - bldr.setLocalLogNominalEntries(3); + bldr.setConCurLogNominalEntries(3); fail(); } catch (final SketchesArgumentException e) { } bldr.setNumPoolThreads(4); @@ -731,7 +731,7 @@ static class SharedLocal { wseg = null; } bldr.setLogNominalEntries(sharedLgK); - bldr.setLocalLogNominalEntries(localLgK); + bldr.setConCurLogNominalEntries(localLgK); bldr.setPropagateOrderedCompact(ordered); bldr.setSeed(this.seed); shared = bldr.buildShared(wseg); diff --git a/src/test/java/org/apache/datasketches/theta/DirectQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/DirectQuickSelectSketchTest.java index f484b5864..329207104 100644 --- a/src/test/java/org/apache/datasketches/theta/DirectQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/DirectQuickSelectSketchTest.java @@ -187,7 +187,7 @@ public void checkWrapIllegalFamilyID_direct() { seg.set(JAVA_BYTE, FAMILY_BYTE, (byte) 0); //corrupt the Sketch ID byte //try to wrap the corrupted seg - DirectQuickSelectSketch.writableWrap(seg, Util.DEFAULT_UPDATE_SEED); + DirectQuickSelectSketch.writableWrap(seg, null, Util.DEFAULT_UPDATE_SEED); } @Test @@ -354,7 +354,7 @@ public void checkDQStoCompactForms() { assertEquals(csk.getClass().getSimpleName(), "HeapCompactSketch"); final int bytes = usk.getCompactBytes(); - assertEquals(bytes, (k*8) + (Family.COMPACT.getMaxPreLongs() << 3)); + assertEquals(bytes, k*8 + (Family.COMPACT.getMaxPreLongs() << 3)); final byte[] segArr2 = new byte[bytes]; final MemorySegment seg2 = MemorySegment.ofArray(segArr2); 
@@ -767,7 +767,7 @@ public void checkConstructorSrcSegCorruptions() { UpdateSketch usk2; seg1.set(JAVA_BYTE, FAMILY_BYTE, (byte) 3); //corrupt Family by setting to Compact try { - usk2 = DirectQuickSelectSketch.writableWrap(seg1, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg1, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //Pass @@ -775,7 +775,7 @@ public void checkConstructorSrcSegCorruptions() { seg1.set(JAVA_BYTE, FAMILY_BYTE, (byte) 2); //fix Family seg1.set(JAVA_BYTE, PREAMBLE_LONGS_BYTE, (byte) 1); //corrupt preLongs try { - usk2 = DirectQuickSelectSketch.writableWrap(seg1, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg1, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //pass @@ -783,7 +783,7 @@ public void checkConstructorSrcSegCorruptions() { seg1.set(JAVA_BYTE, PREAMBLE_LONGS_BYTE, (byte) 3); //fix preLongs seg1.set(JAVA_BYTE, SER_VER_BYTE, (byte) 2); //corrupt serVer try { - usk2 = DirectQuickSelectSketch.writableWrap(seg1, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg1, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //pass @@ -793,7 +793,7 @@ public void checkConstructorSrcSegCorruptions() { seg1.set(JAVA_LONG_UNALIGNED, THETA_LONG, Long.MAX_VALUE >>> 1); //corrupt theta and seg1.set(JAVA_BYTE, LG_ARR_LONGS_BYTE, (byte) 10); //corrupt lgArrLongs try { - usk2 = DirectQuickSelectSketch.writableWrap(seg1, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg1, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //pass @@ -803,7 +803,7 @@ public void checkConstructorSrcSegCorruptions() { final byte badFlags = (byte) (COMPACT_FLAG_MASK | 
READ_ONLY_FLAG_MASK | ORDERED_FLAG_MASK); seg1.set(JAVA_BYTE, FLAGS_BYTE, badFlags); try { - usk2 = DirectQuickSelectSketch.writableWrap(seg1, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg1, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //pass @@ -812,7 +812,7 @@ public void checkConstructorSrcSegCorruptions() { final byte[] arr2 = Arrays.copyOfRange(arr1, 0, bytes-1); //corrupt length final MemorySegment seg2 = MemorySegment.ofArray(arr2); try { - usk2 = DirectQuickSelectSketch.writableWrap(seg2, Util.DEFAULT_UPDATE_SEED); + usk2 = DirectQuickSelectSketch.writableWrap(seg2, null, Util.DEFAULT_UPDATE_SEED); fail("Expected SketchesArgumentException"); } catch (final SketchesArgumentException e) { //pass @@ -831,14 +831,14 @@ public void checkCorruptRFWithInsufficientArray() { usk.update(0); insertLgResizeFactor(seg, 0); // corrupt RF: X1 - final UpdateSketch dqss = DirectQuickSelectSketch.writableWrap(seg, Util.DEFAULT_UPDATE_SEED); + final UpdateSketch dqss = DirectQuickSelectSketch.writableWrap(seg, null, Util.DEFAULT_UPDATE_SEED); assertEquals(dqss.getResizeFactor(), ResizeFactor.X2); // force-promote to X2 } @Test public void checkFamilyAndRF() { final int k = 16; - final MemorySegment seg = MemorySegment.ofArray(new byte[(k*16) + 24]); + final MemorySegment seg = MemorySegment.ofArray(new byte[k*16 + 24]); final UpdateSketch sketch = Sketches.updateSketchBuilder().setNominalEntries(k).build(seg); assertEquals(sketch.getFamily(), Family.QUICKSELECT); assertEquals(sketch.getResizeFactor(), ResizeFactor.X8); @@ -849,7 +849,7 @@ public void checkFamilyAndRF() { public void checkResizeInBigSeg() { final int k = 1 << 14; final int u = 1 << 20; - final MemorySegment seg = MemorySegment.ofArray(new byte[(8*k*16) +24]); + final MemorySegment seg = MemorySegment.ofArray(new byte[8*k*16 +24]); final UpdateSketch sketch = 
Sketches.updateSketchBuilder().setNominalEntries(k).build(seg); for (int i=0; i 0) && !(((lgT - lgA) % lgR) == 0); - boolean rf0 = (lgR == 0) && (lgA != lgT); - assertTrue((lgRbad == rf0) || (lgRbad == rf123)); + final boolean lgRbad = isResizeFactorIncorrect(wseg, lgK, lgA); + final boolean rf123 = lgR > 0 && (lgT - lgA) % lgR != 0; + final boolean rf0 = lgR == 0 && lgA != lgT; + assertTrue(lgRbad == rf0 || lgRbad == rf123); } } } @@ -211,12 +211,12 @@ public void checkIsResizeFactorIncorrect() { public void checkCompactOpsMemorySegmentToCompact() { MemorySegment skwseg, cskwseg1, cskwseg2, cskwseg3; CompactSketch csk1, csk2, csk3; - int lgK = 6; - UpdateSketch sk = Sketches.updateSketchBuilder().setLogNominalEntries(lgK).build(); - int n = 1 << (lgK + 1); + final int lgK = 6; + final UpdateSketch sk = Sketches.updateSketchBuilder().setLogNominalEntries(lgK).build(); + final int n = 1 << lgK + 1; for (int i = 2; i < n; i++) { sk.update(i); } - int cbytes = sk.getCompactBytes(); - byte[] byteArr = sk.toByteArray(); + final int cbytes = sk.getCompactBytes(); + final byte[] byteArr = sk.toByteArray(); skwseg = MemorySegment.ofArray(byteArr); cskwseg1 = MemorySegment.ofArray(new byte[cbytes]); cskwseg2 = MemorySegment.ofArray(new byte[cbytes]); @@ -236,7 +236,7 @@ public void printlnTest() { /** * @param s value to print */ - static void println(String s) { + static void println(final String s) { //System.out.println(s); //disable here } } From 661220057cccac1e467bf809ad21c5a8e7647703 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 25 Sep 2025 18:16:42 -0700 Subject: [PATCH 15/26] Take advantage of Java 25 Flexible Constructor Bodies JEP 513. And remove the awkward constructors used to avoid Finalizer Attacks. 
--- .../quantiles/DoublesSketchAccessor.java | 14 ++------ .../datasketches/tuple/QuickSelectSketch.java | 24 ++++--------- ...DirectArrayOfDoublesQuickSelectSketch.java | 36 +++---------------- 3 files changed, 12 insertions(+), 62 deletions(-) diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java index f4991d658..41355f63d 100644 --- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java +++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketchAccessor.java @@ -44,26 +44,16 @@ abstract class DoublesSketchAccessor extends DoublesBufferAccessor { final DoublesSketch ds, final boolean forceSize, final int level) { - this(checkLvl(level), ds, forceSize, level); - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. - //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - } - - private DoublesSketchAccessor( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final DoublesSketch ds, - final boolean forceSize, - final int level) { + checkLvl(level); ds_ = ds; forceSize_ = forceSize; setLevel(level); } - private static final boolean checkLvl(final int level) { + private static final void checkLvl(final int level) { if ((level != BB_LVL_IDX) && (level < 0)) { throw new SketchesArgumentException("Parameter level is < 0."); } - return true; } /** diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java index c264366a4..92355a77b 100644 --- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java @@ -135,8 +135,9 @@ private QuickSelectSketch( final float samplingProbability, final SummaryFactory summaryFactory, final int startingSize) { + final long thetaLong = (long) 
(Long.MAX_VALUE * (double) samplingProbability); super( - (long) (Long.MAX_VALUE * (double) samplingProbability), + thetaLong, true, summaryFactory); nomEntries_ = ceilingPowerOf2(nomEntries); @@ -182,23 +183,9 @@ private QuickSelectSketch( final MemorySegment seg, final SummaryDeserializer deserializer, final SummaryFactory summaryFactory) { - this(new Validate<>(), seg, deserializer, summaryFactory); - } - - /* - * This private constructor is used to protect against "Finalizer attacks". - * The private static inner class Validate performs validation and deserialization - * from the input MemorySegment and may throw exceptions. In order to protect against the attack, we must - * perform this validation prior to the constructor's super reaches the Object class. - * Making QuickSelectSketch final won't work here because UpdatableSketch is a subclass. - * Using an empty final finalizer() is not recommended and is deprecated as of Java9. - */ - private QuickSelectSketch( - final Validate val, - final MemorySegment seg, - final SummaryDeserializer deserializer, - final SummaryFactory summaryFactory) { - super(val.validate(seg, deserializer), val.myEmpty, summaryFactory); + //this(new Validate<>(), seg, deserializer, summaryFactory); + final Validate val = new Validate<>(); + final long thetaLong = val.validate(seg, deserializer); nomEntries_ = val.myNomEntries; lgResizeFactor_ = val.myLgResizeFactor; samplingProbability_ = val.mySamplingProbability; @@ -207,6 +194,7 @@ private QuickSelectSketch( rebuildThreshold_ = val.myRebuildThreshold; hashTable_ = val.myHashTable; summaryTable_ = val.mySummaryTable; + super(thetaLong, val.myEmpty, summaryFactory); } private static final class Validate { diff --git a/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java index 344df137c..f32571d0b 100644 --- 
a/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java @@ -79,25 +79,7 @@ class DirectArrayOfDoublesQuickSelectSketch extends ArrayOfDoublesQuickSelectSke final int numValues, final long seed, final MemorySegment dstSeg) { - this(checkMemorySegment(nomEntries, lgResizeFactor, numValues, dstSeg), - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. - //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - nomEntries, - lgResizeFactor, - samplingProbability, - numValues, - seed, - dstSeg); - } - - private DirectArrayOfDoublesQuickSelectSketch( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final int nomEntries, - final int lgResizeFactor, - final float samplingProbability, - final int numValues, - final long seed, - final MemorySegment dstSeg) { + checkMemorySegment(nomEntries, lgResizeFactor, numValues, dstSeg); super(numValues, seed); seg_ = dstSeg; final int startingCapacity = Util.getStartingCapacity(nomEntries, lgResizeFactor); @@ -126,14 +108,13 @@ private DirectArrayOfDoublesQuickSelectSketch( setRebuildThreshold(); } - private static final boolean checkMemorySegment( + private static final void checkMemorySegment( final int nomEntries, final int lgResizeFactor, final int numValues, final MemorySegment dstSeg) { final int startingCapacity = Util.getStartingCapacity(nomEntries, lgResizeFactor); checkMemorySegmentSize(dstSeg, startingCapacity, numValues); - return true; } /** @@ -144,15 +125,7 @@ private static final boolean checkMemorySegment( DirectArrayOfDoublesQuickSelectSketch( final MemorySegment seg, final long seed) { - this(checkSerVer(seg), seg, seed); - //SpotBugs CT_CONSTRUCTOR_THROW is false positive. 
- //this construction scheme is compliant with SEI CERT Oracle Coding Standard for Java / OBJ11-J - } - - private DirectArrayOfDoublesQuickSelectSketch( - @SuppressWarnings("unused") final boolean secure, //required part of Finalizer Attack prevention - final MemorySegment seg, - final long seed) { + checkSerVer(seg); super(seg.get(JAVA_BYTE, NUM_VALUES_BYTE), seed); seg_ = seg; SerializerDeserializer.validateFamily(seg.get(JAVA_BYTE, FAMILY_ID_BYTE), @@ -170,13 +143,12 @@ private DirectArrayOfDoublesQuickSelectSketch( setRebuildThreshold(); } - private static final boolean checkSerVer(final MemorySegment seg) { + private static final void checkSerVer(final MemorySegment seg) { final byte version = seg.get(JAVA_BYTE, SERIAL_VERSION_BYTE); if (version != serialVersionUID) { throw new SketchesArgumentException("Serial version mismatch. Expected: " + serialVersionUID + ", actual: " + version); } - return true; } @Override From 77da42d812c372d99f8c74422ec43494e1ba17cf Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 30 Sep 2025 13:47:35 -0700 Subject: [PATCH 16/26] This set of changes removed the largely redundant theta Sketches class. 
--- .../datasketches/theta/CompactOperations.java | 3 +- .../datasketches/theta/CompactSketch.java | 6 +- .../theta/DirectCompactCompressedSketch.java | 11 +- .../theta/DirectCompactSketch.java | 15 +- .../theta/DirectQuickSelectSketch.java | 7 +- .../theta/DirectQuickSelectSketchR.java | 5 +- .../theta/ForwardCompatibility.java | 5 +- .../datasketches/theta/HeapAlphaSketch.java | 27 +- .../datasketches/theta/HeapCompactSketch.java | 1 + .../theta/HeapQuickSelectSketch.java | 3 +- .../datasketches/theta/Intersection.java | 90 ---- .../datasketches/theta/IntersectionImpl.java | 112 ++++- .../theta/MemorySegmentHashIterator.java | 2 +- .../datasketches/theta/PreambleUtil.java | 2 +- .../datasketches/theta/SingleItemSketch.java | 3 +- .../org/apache/datasketches/theta/Sketch.java | 141 ++++-- .../apache/datasketches/theta/Sketches.java | 406 ------------------ .../apache/datasketches/theta/UnionImpl.java | 23 +- .../datasketches/theta/UpdateSketch.java | 16 +- .../datasketches/theta/AnotBimplTest.java | 23 +- .../datasketches/theta/CompactSketchTest.java | 54 +-- ...ConcurrentDirectQuickSelectSketchTest.java | 13 +- .../ConcurrentHeapQuickSelectSketchTest.java | 13 +- .../theta/DirectIntersectionTest.java | 44 +- .../theta/DirectQuickSelectSketchTest.java | 24 +- .../datasketches/theta/DirectUnionTest.java | 31 +- .../apache/datasketches/theta/EmptyTest.java | 42 +- .../theta/ForwardCompatibilityTest.java | 23 +- .../theta/HeapAlphaSketchTest.java | 12 +- .../theta/HeapIntersectionTest.java | 30 +- .../theta/HeapQuickSelectSketchTest.java | 16 +- .../datasketches/theta/HeapUnionTest.java | 19 +- .../theta/HeapifyWrapSerVer1and2Test.java | 133 +++--- .../datasketches/theta/IteratorTest.java | 10 +- .../theta/PairwiseSetOperationsTest.java | 13 +- .../theta/ReadOnlyMemorySegmentTest.java | 4 +- .../datasketches/theta/SetOperationTest.java | 34 +- .../theta/SetOpsCornerCasesTest.java | 35 +- .../theta/SingleItemSketchTest.java | 57 ++- ...{SketchesTest.java => 
SketchMiscTest.java} | 76 ++-- .../apache/datasketches/theta/SketchTest.java | 7 +- .../datasketches/theta/UnionImplTest.java | 50 +-- .../datasketches/theta/UpdateSketchTest.java | 9 +- ...BoundsOnRatiosInThetaSketchedSetsTest.java | 13 +- 44 files changed, 612 insertions(+), 1051 deletions(-) delete mode 100644 src/main/java/org/apache/datasketches/theta/Sketches.java rename src/test/java/org/apache/datasketches/theta/{SketchesTest.java => SketchMiscTest.java} (62%) diff --git a/src/main/java/org/apache/datasketches/theta/CompactOperations.java b/src/main/java/org/apache/datasketches/theta/CompactOperations.java index 54ec7d10f..9d23f7263 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactOperations.java +++ b/src/main/java/org/apache/datasketches/theta/CompactOperations.java @@ -33,7 +33,6 @@ import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; @@ -122,7 +121,7 @@ static CompactSketch segmentToCompact( final MemorySegment dstWSeg) { //extract Pre0 fields and Flags from srcMem - final int srcPreLongs = extractPreLongs(srcSeg); + final int srcPreLongs = Sketch.getPreambleLongs(srcSeg); final int srcSerVer = extractSerVer(srcSeg); //not used final int srcFamId = extractFamilyID(srcSeg); final int srcLgArrLongs = extractLgArrLongs(srcSeg); diff --git a/src/main/java/org/apache/datasketches/theta/CompactSketch.java b/src/main/java/org/apache/datasketches/theta/CompactSketch.java index f597db7d1..da890d9e3 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactSketch.java +++ 
b/src/main/java/org/apache/datasketches/theta/CompactSketch.java @@ -35,7 +35,6 @@ import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractNumEntriesBytesV4; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4; @@ -78,7 +77,8 @@ public abstract class CompactSketch extends Sketch { * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg) { - return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED, false); + //final boolean checkSeedHash = extractSerVer(srcSeg) != 1; + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED, false); //false for SerVer 1 only } /** @@ -459,7 +459,7 @@ private byte[] toByteArrayV4() { } private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { - final int preLongs = extractPreLongs(srcSeg); + final int preLongs = Sketch.getPreambleLongs(srcSeg); final int entryBits = extractEntryBitsV4(srcSeg); final int numEntriesBytes = extractNumEntriesBytesV4(srcSeg); final short seedHash = (short) extractSeedHash(srcSeg); diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java index 2bf154215..55d7aa31e 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java @@ -22,7 +22,6 @@ import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static org.apache.datasketches.theta.PreambleUtil.extractEntryBitsV4; import static 
org.apache.datasketches.theta.PreambleUtil.extractNumEntriesBytesV4; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4; import static org.apache.datasketches.theta.PreambleUtil.wholeBytesToHoldBits; @@ -70,12 +69,12 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe MemorySegment.copy(seg_, 0, dstSeg, 0, getCurrentBytes()); return new DirectCompactSketch(dstSeg); } - return CompactSketch.heapify(seg_); + return CompactSketch.heapify(seg_, Util.DEFAULT_UPDATE_SEED); } @Override public int getCurrentBytes() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); final int entryBits = extractEntryBitsV4(seg_); final int numEntriesBytes = extractNumEntriesBytesV4(seg_); return preLongs * Long.BYTES + numEntriesBytes + wholeBytesToHoldBits(getRetainedEntries() * entryBits); @@ -89,7 +88,7 @@ public int getRetainedEntries(final boolean valid) { //compact is always valid // number of entries is stored using variable length encoding // most significant bytes with all zeros are not stored // one byte in the preamble has the number of non-zero bytes used - final int preLongs = extractPreLongs(seg_); // if > 1 then the second long has theta + final int preLongs = Sketch.getPreambleLongs(seg_); // if > 1 then the second long has theta final int numEntriesBytes = extractNumEntriesBytesV4(seg_); int offsetBytes = preLongs > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE; int numEntries = 0; @@ -101,7 +100,7 @@ public int getRetainedEntries(final boolean valid) { //compact is always valid @Override public long getThetaLong() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs > 1) ? 
extractThetaLongV4(seg_) : Long.MAX_VALUE; } @@ -119,7 +118,7 @@ public boolean isOrdered() { public HashIterator iterator() { return new MemorySegmentCompactCompressedHashIterator( seg_, - (extractPreLongs(seg_) > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE) + (Sketch.getPreambleLongs(seg_) > 1 ? START_PACKED_DATA_ESTIMATION_MODE : START_PACKED_DATA_EXACT_MODE) + extractNumEntriesBytesV4(seg_), extractEntryBitsV4(seg_), getRetainedEntries() diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java index 2fcbf08d6..b289a1dbf 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java @@ -26,7 +26,6 @@ import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem; @@ -53,7 +52,7 @@ class DirectCompactSketch extends CompactSketch { /** * Construct this sketch with the given MemorySegment. - * @param seg Read-only MemorySegment object with the order bit properly set. + * @param seg (optional) Read-only MemorySegment object. 
*/ DirectCompactSketch(final MemorySegment seg) { seg_ = seg; @@ -82,7 +81,7 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe @Override public int getCurrentBytes() { if (otherCheckForSingleItem(seg_)) { return 16; } - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 0 : extractCurCount(seg_); return (preLongs + curCount) << 3; } @@ -90,13 +89,13 @@ public int getCurrentBytes() { @Override public int getRetainedEntries(final boolean valid) { //compact is always valid if (otherCheckForSingleItem(seg_)) { return 1; } - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs == 1) ? 0 : extractCurCount(seg_); } @Override public long getThetaLong() { - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs > 2) ? extractThetaLong(seg_) : Long.MAX_VALUE; } @@ -148,7 +147,7 @@ public byte[] toByteArray() { @Override long[] getCache() { if (otherCheckForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } - final int preLongs = extractPreLongs(seg_); + final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 
0 : extractCurCount(seg_); if (curCount > 0) { final long[] cache = new long[curCount]; @@ -160,12 +159,12 @@ long[] getCache() { @Override int getCompactPreambleLongs() { - return extractPreLongs(seg_); + return Sketch.getPreambleLongs(seg_); } @Override int getCurrentPreambleLongs() { - return extractPreLongs(seg_); + return Sketch.getPreambleLongs(seg_); } @Override diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java index 5980f7bbb..3480ac2ea 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java @@ -33,8 +33,7 @@ import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; -import static org.apache.datasketches.theta.PreambleUtil.getSegBytes; +import static org.apache.datasketches.theta.PreambleUtil.getUpdatableSegBytes; import static org.apache.datasketches.theta.PreambleUtil.insertCurCount; import static org.apache.datasketches.theta.PreambleUtil.insertFamilyID; import static org.apache.datasketches.theta.PreambleUtil.insertFlags; @@ -132,7 +131,7 @@ private DirectQuickSelectSketch( final int lgArrLongs = lgRF == 0 ? 
lgNomLongs + 1 : ThetaUtil.MIN_LG_ARR_LONGS; //check Segment capacity - final int minReqBytes = getSegBytes(lgArrLongs, preambleLongs); + final int minReqBytes = getUpdatableSegBytes(lgArrLongs, preambleLongs); final long curSegCapBytes = dstSeg.byteSize(); if (curSegCapBytes < minReqBytes) { throw new SketchesArgumentException( @@ -176,7 +175,7 @@ static DirectQuickSelectSketch writableWrap( final MemorySegment srcSeg, final MemorySegmentRequest mSegReq, final long seed) { - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index e4f7fcaf1..e3f7197cc 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -37,7 +37,6 @@ import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.PreambleUtil.insertThetaLong; @@ -103,7 +102,7 @@ private DirectQuickSelectSketchR(final long seed, final MemorySegment srcSeg) { * @return instance of this sketch */ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = 
extractLgArrLongs(srcSeg); //byte 4 @@ -240,7 +239,7 @@ int getCompactPreambleLongs() { @Override int getCurrentPreambleLongs() { - return PreambleUtil.extractPreLongs(wseg_); + return Sketch.getPreambleLongs(wseg_); } @Override diff --git a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java index 723a8b651..ec81f7c2e 100644 --- a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java +++ b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java @@ -22,7 +22,6 @@ import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import java.lang.foreign.MemorySegment; @@ -56,7 +55,7 @@ private ForwardCompatibility() { } */ static final CompactSketch heapify1to3(final MemorySegment srcSeg, final short seedHash) { final int segCap = (int) srcSeg.byteSize(); - final int preLongs = extractPreLongs(srcSeg); //always 3 for serVer 1 + final int preLongs = Sketch.getPreambleLongs(srcSeg); //always 3 for serVer 1 if (preLongs != 3) { throw new SketchesArgumentException("PreLongs must be 3 for SerVer 1: " + preLongs); } @@ -97,7 +96,7 @@ static final CompactSketch heapify1to3(final MemorySegment srcSeg, final short s */ static final CompactSketch heapify2to3(final MemorySegment srcSeg, final short seedHash) { final int segCap = (int) srcSeg.byteSize(); - final int preLongs = extractPreLongs(srcSeg); //1,2 or 3 + final int preLongs = Sketch.getPreambleLongs(srcSeg); //1,2 or 3 final int familyId = extractFamilyID(srcSeg); //1,2,3,4 if ((familyId < 1) || (familyId > 4)) { throw new SketchesArgumentException("Family (Sketch Type) must be 1 to 4: " + familyId); diff 
--git a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java index 8a35631ab..6aeb09401 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java @@ -23,6 +23,7 @@ import static java.lang.Math.min; import static java.lang.Math.sqrt; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; +import static org.apache.datasketches.common.Util.DEFAULT_UPDATE_SEED; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.common.Util.checkBounds; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; @@ -31,7 +32,6 @@ import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor; import static org.apache.datasketches.theta.PreambleUtil.extractP; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncremented; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountNotIncremented; @@ -112,6 +112,17 @@ static HeapAlphaSketch newHeapInstance(final int lgNomLongs, final long seed, fi return has; } + /** + * Heapify a sketch from a MemorySegment object containing sketch data. + * @param srcSeg The source MemorySegment object. + * It must have a size of at least 24 bytes. + * The assumed seed is {@link org.apache.datasketches.common.Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} + * @return instance of this sketch + */ + static HeapAlphaSketch heapifyInstance(final MemorySegment srcSeg) { + return heapifyInstance(srcSeg, DEFAULT_UPDATE_SEED); + } + /** * Heapify a sketch from a MemorySegment object containing sketch data. 
* @param srcSeg The source MemorySegment object. @@ -123,7 +134,7 @@ static HeapAlphaSketch newHeapInstance(final int lgNomLongs, final long seed, fi static HeapAlphaSketch heapifyInstance(final MemorySegment srcSeg, final long expectedSeed) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); checkBounds(0, 24, srcSeg.byteSize()); - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 @@ -234,14 +245,14 @@ public boolean isEmpty() { *
        * Long || Start Byte Adr:
        * Adr:
    -   *      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |        0           |
    -   *  0   ||    Seed Hash    | Flags  |  LgArr | LgNom  | FamID  | SerVer | lgRF | PreLongs=3  |
    +   *      ||    7   |    6   |    5   |    4   |    3   |    2     |    1     |        0           |
    +   *  0   ||    Seed Hash    | Flags  |  LgArr | LgNom  | FamID=1  | SerVer=3 | lgRF | PreLongs=3  |
        *
    -   *      ||   15   |   14   |   13   |   12   |   11   |   10   |    9   |     8              |
    -   *  1   ||-----------------p-----------------|----------Retained Entries Count---------------|
    +   *      ||   15   |   14   |   13   |   12   |   11   |   10     |    9     |     8              |
    +   *  1   ||-----------------p-----------------|----------Retained Entries Count-------------------|
        *
    -   *      ||   23   |   22   |   21    |  20   |   19   |   18   |   17   |    16              |
    -   *  2   ||---------------------------------Theta---------------------------------------------|
    +   *      ||   23   |   22   |   21   |   20   |   19   |   18     |   17     |    16              |
    +   *  2   ||---------------------------------Theta-------------------------------------------------|
        * 
    */ diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java index fdd2860ce..50c419e61 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java @@ -57,6 +57,7 @@ final class HeapCompactSketch extends CompactSketch { * @param curCount correct value * @param thetaLong The correct * thetaLong. + * @param ordered true if cache is ordered. */ HeapCompactSketch(final long[] cache, final boolean empty, final short seedHash, final int curCount, final long thetaLong, final boolean ordered) { diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java index 5d8af6bfb..b51273404 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java @@ -30,7 +30,6 @@ import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor; import static org.apache.datasketches.theta.PreambleUtil.extractP; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncremented; import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncrementedRebuilt; @@ -108,7 +107,7 @@ private HeapQuickSelectSketch(final int lgNomLongs, final long seed, final float * @return instance of this sketch */ static HeapQuickSelectSketch heapifyInstance(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = extractPreLongs(srcSeg); //byte 0 + final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 final int lgNomLongs = 
extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 diff --git a/src/main/java/org/apache/datasketches/theta/Intersection.java b/src/main/java/org/apache/datasketches/theta/Intersection.java index a31dc3ef9..134c49ff6 100644 --- a/src/main/java/org/apache/datasketches/theta/Intersection.java +++ b/src/main/java/org/apache/datasketches/theta/Intersection.java @@ -20,23 +20,13 @@ package org.apache.datasketches.theta; import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static org.apache.datasketches.common.Util.floorPowerOf2; -import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER; import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; -import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; -import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; -import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import java.lang.foreign.MemorySegment; -import java.util.Arrays; import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.thetacommon.ThetaUtil; /** * The API for intersection operations @@ -164,84 +154,4 @@ public static Intersection wrap(final MemorySegment srcSeg, final long expectedS return IntersectionImpl.wrapInstance(srcSeg, expectedSeed, srcSeg.isReadOnly() ); } - // Restricted - - /** - * Returns the maximum lgArrLongs given the capacity of the MemorySegment. 
- * @param dstSeg the given MemorySegment - * @return the maximum lgArrLongs given the capacity of the MemorySegment - */ - protected static int getMaxLgArrLongs(final MemorySegment dstSeg) { - final int preBytes = CONST_PREAMBLE_LONGS << 3; - final long cap = dstSeg.byteSize(); - return Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3); - } - - protected static void checkMinSizeMemorySegment(final MemorySegment seg) { - final int minBytes = (CONST_PREAMBLE_LONGS << 3) + (8 << ThetaUtil.MIN_LG_ARR_LONGS);//280 - final long cap = seg.byteSize(); - if (cap < minBytes) { - throw new SketchesArgumentException( - "MemorySegment must be at least " + minBytes + " bytes. Actual capacity: " + cap); - } - } - - /** - * Compact first 2^lgArrLongs of given array - * @param srcCache anything - * @param lgArrLongs The correct - * lgArrLongs. - * @param curCount must be correct - * @param thetaLong The correct - * thetaLong. - * @param dstOrdered true if output array must be sorted - * @return the compacted array - */ //Only used in IntersectionImpl & Test - static final long[] compactCachePart(final long[] srcCache, final int lgArrLongs, - final int curCount, final long thetaLong, final boolean dstOrdered) { - if (curCount == 0) { - return new long[0]; - } - final long[] cacheOut = new long[curCount]; - final int len = 1 << lgArrLongs; - int j = 0; - for (int i = 0; i < len; i++) { - final long v = srcCache[i]; - if (v <= 0L || v >= thetaLong ) { continue; } - cacheOut[j++] = v; - } - assert curCount == j; - if (dstOrdered) { - Arrays.sort(cacheOut); - } - return cacheOut; - } - - protected static void segChecks(final MemorySegment srcSeg) { - //Get Preamble - //Note: Intersection does not use lgNomLongs (or k), per se. 
- //seedHash loaded and checked in private constructor - final int preLongs = extractPreLongs(srcSeg); - final int serVer = extractSerVer(srcSeg); - final int famID = extractFamilyID(srcSeg); - final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) > 0; - final int curCount = extractCurCount(srcSeg); - //Checks - if (preLongs != CONST_PREAMBLE_LONGS) { - throw new SketchesArgumentException( - "MemorySegment PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongs); - } - if (serVer != SER_VER) { - throw new SketchesArgumentException("Serialization Version must equal " + SER_VER); - } - Family.INTERSECTION.checkFamilyID(famID); - if (empty) { - if (curCount != 0) { - throw new SketchesArgumentException( - "srcSeg empty state inconsistent with curCount: " + empty + "," + curCount); - } - //empty = true AND curCount_ = 0: OK - } //else empty = false, curCount could be anything - } - } diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java index ebb4a6215..6819524b1 100644 --- a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java @@ -26,6 +26,7 @@ import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED; import static org.apache.datasketches.common.Util.clearBits; +import static org.apache.datasketches.common.Util.floorPowerOf2; import static org.apache.datasketches.common.Util.setBits; import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; @@ -41,8 +42,10 @@ import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.clearEmpty; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; +import static 
org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; +import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; import static org.apache.datasketches.theta.PreambleUtil.insertCurCount; import static org.apache.datasketches.theta.PreambleUtil.insertFamilyID; @@ -81,17 +84,17 @@ * @author Kevin Lang */ final class IntersectionImpl extends Intersection { - protected final short seedHash_; - protected final boolean readOnly_; //True if this sketch is to be treated as read only - protected final MemorySegment wseg_; - protected final int maxLgArrLongs_; //only used with MemorySegment, not serialized + private final short seedHash_; + private final boolean readOnly_; //True if this sketch is to be treated as read only + private final MemorySegment wseg_; + private final int maxLgArrLongs_; //only used with MemorySegment, not serialized //Note: Intersection does not use lgNomLongs or k, per se. - protected int lgArrLongs_; //current size of hash table - protected int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true - protected long thetaLong_; - protected boolean empty_; //A virgin intersection represents the Universal Set, so empty is FALSE! - protected long[] hashTable_; //retained entries of the intersection, on-heap only. + private int lgArrLongs_; //current size of hash table + private int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true + private long thetaLong_; + private boolean empty_; //A virgin intersection represents the Universal Set, so empty is FALSE! + private long[] hashTable_; //retained entries of the intersection, on-heap only. /** * Constructor: Sets the class finals and computes, sets and checks the seedHash. 
@@ -100,14 +103,14 @@ final class IntersectionImpl extends Intersection { * @param dstSegFlag The given MemorySegment is a Destination (new offHeap) MemorySegment. * @param readOnly True if MemorySegment is to be treated as read only. */ - protected IntersectionImpl(final MemorySegment wseg, final long seed, final boolean dstSegFlag, + private IntersectionImpl(final MemorySegment wseg, final long seed, final boolean dstSegFlag, final boolean readOnly) { readOnly_ = readOnly; if (wseg != null) { wseg_ = wseg; if (dstSegFlag) { //DstSeg: compute & store seedHash, no seedHash checking - checkMinSizeMemorySegment(wseg); - maxLgArrLongs_ = !readOnly ? getMaxLgArrLongs(wseg) : 0; //Only Off Heap + IntersectionImpl.checkMinSizeMemorySegment(wseg); + maxLgArrLongs_ = !readOnly ? IntersectionImpl.getMaxLgArrLongs(wseg) : 0; //Only Off Heap seedHash_ = Util.computeSeedHash(seed); wseg_.set(JAVA_SHORT_UNALIGNED, SEED_HASH_SHORT, seedHash_); } else { //SrcSeg:gets and stores the seedHash, checks seg_seedHash against the seed @@ -179,8 +182,8 @@ static IntersectionImpl initNewDirectInstance(final long seed, final MemorySegme static IntersectionImpl heapifyInstance(final MemorySegment srcSeg, final long seed) { final boolean dstSegFlag = false; final boolean readOnly = false; + IntersectionImpl.segChecks(srcSeg); final IntersectionImpl impl = new IntersectionImpl(null, seed, dstSegFlag, readOnly); - segChecks(srcSeg); //Initialize impl.lgArrLongs_ = extractLgArrLongs(srcSeg); @@ -207,8 +210,8 @@ static IntersectionImpl wrapInstance( final long seed, final boolean readOnly) { final boolean dstSegFlag = false; + IntersectionImpl.segChecks(srcSeg); final IntersectionImpl impl = new IntersectionImpl(srcSeg, seed, dstSegFlag, readOnly); - segChecks(srcSeg); impl.lgArrLongs_ = extractLgArrLongs(srcSeg); impl.curCount_ = extractCurCount(srcSeg); impl.thetaLong_ = extractThetaLong(srcSeg); @@ -333,7 +336,7 @@ public CompactSketch getResult(final boolean dstOrdered, final MemorySegment 
dst } else { hashTable = hashTable_; } - compactCache = compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered); + compactCache = IntersectionImpl.compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered); srcCompact = true; srcOrdered = dstOrdered; return CompactOperations.componentsToCompact( @@ -561,4 +564,83 @@ private void resetCommon() { thetaLong_ = Long.MAX_VALUE; hashTable_ = null; } + + /** + * Compact first 2^lgArrLongs of given array + * @param srcCache anything + * @param lgArrLongs The correct + * lgArrLongs. + * @param curCount must be correct + * @param thetaLong The correct + * thetaLong. + * @param dstOrdered true if output array must be sorted + * @return the compacted array + */ //used in Test + static final long[] compactCachePart(final long[] srcCache, final int lgArrLongs, + final int curCount, final long thetaLong, final boolean dstOrdered) { + if (curCount == 0) { + return new long[0]; + } + final long[] cacheOut = new long[curCount]; + final int len = 1 << lgArrLongs; + int j = 0; + for (int i = 0; i < len; i++) { + final long v = srcCache[i]; + if (v <= 0L || v >= thetaLong ) { continue; } + cacheOut[j++] = v; + } + assert curCount == j; + if (dstOrdered) { + Arrays.sort(cacheOut); + } + return cacheOut; + } + + private static void checkMinSizeMemorySegment(final MemorySegment seg) { + final int minBytes = (CONST_PREAMBLE_LONGS << 3) + (8 << ThetaUtil.MIN_LG_ARR_LONGS);//280 + final long cap = seg.byteSize(); + if (cap < minBytes) { + throw new SketchesArgumentException( + "MemorySegment must be at least " + minBytes + " bytes. Actual capacity: " + cap); + } + } + + /** + * Returns the maximum lgArrLongs given the capacity of the MemorySegment. 
+ * @param dstSeg the given MemorySegment + * @return the maximum lgArrLongs given the capacity of the MemorySegment + */ + private static int getMaxLgArrLongs(final MemorySegment dstSeg) { + final int preBytes = CONST_PREAMBLE_LONGS << 3; + final long cap = dstSeg.byteSize(); + return Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3); + } + + private static void segChecks(final MemorySegment srcSeg) { + //Get Preamble + //Note: Intersection does not use lgNomLongs (or k), per se. + //seedHash loaded and checked in private constructor + final int preLongs = Sketch.getPreambleLongs(srcSeg); + final int serVer = extractSerVer(srcSeg); + final int famID = extractFamilyID(srcSeg); + final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) > 0; + final int curCount = extractCurCount(srcSeg); + //Checks + if (preLongs != CONST_PREAMBLE_LONGS) { + throw new SketchesArgumentException( + "MemorySegment PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongs); + } + if (serVer != SER_VER) { + throw new SketchesArgumentException("Serialization Version must equal " + SER_VER); + } + Family.INTERSECTION.checkFamilyID(famID); + if (empty) { + if (curCount != 0) { + throw new SketchesArgumentException( + "srcSeg empty state inconsistent with curCount: " + empty + "," + curCount); + } + //empty = true AND curCount_ = 0: OK + } //else empty = false, curCount could be anything + } + } diff --git a/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java b/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java index 548c79ef3..53344c8d6 100644 --- a/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java +++ b/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java @@ -38,7 +38,7 @@ final class MemorySegmentHashIterator implements HashIterator { this.seg = srcSeg; this.arrLongs = arrLongs; this.thetaLong = thetaLong; - offsetBytes = 
PreambleUtil.extractPreLongs(srcSeg) << 3; + offsetBytes = Sketch.getPreambleLongs(srcSeg) << 3; index = -1; hash = 0; } diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index 294682e04..19dec2061 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -233,7 +233,7 @@ private PreambleUtil() {} * @param preambleLongs current preamble size * @return the size in bytes */ - static int getSegBytes(final int lgArrLongs, final int preambleLongs) { + static int getUpdatableSegBytes(final int lgArrLongs, final int preambleLongs) { return (8 << lgArrLongs) + (preambleLongs << 3); } diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java index abf8df391..062c6d86d 100644 --- a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java +++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java @@ -26,7 +26,6 @@ import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; @@ -385,7 +384,7 @@ short getSeedHash() { } static boolean otherCheckForSingleItem(final MemorySegment seg) { - return otherCheckForSingleItem(extractPreLongs(seg), extractSerVer(seg), + return otherCheckForSingleItem(Sketch.getPreambleLongs(seg), extractSerVer(seg), extractFamilyID(seg), extractFlags(seg) ); } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 
8ff9c13d0..bc944478d 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -20,16 +20,24 @@ package org.apache.datasketches.theta; import static java.lang.foreign.ValueLayout.JAVA_BYTE; +import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; +import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static org.apache.datasketches.common.Family.idToFamily; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.common.Util.LS; import static org.apache.datasketches.common.Util.ceilingPowerOf2; import static org.apache.datasketches.common.Util.zeroPad; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; +import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; +import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; +import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; +import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; +import static org.apache.datasketches.theta.PreambleUtil.getAndCheckPreLongs; import static org.apache.datasketches.thetacommon.HashOperations.count; import java.lang.foreign.MemorySegment; @@ -70,12 +78,12 @@ public abstract class Sketch implements MemorySegmentStatus { * @return a Sketch on the heap. 
*/ public static Sketch heapify(final MemorySegment srcSeg) { - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); - if (family == Family.COMPACT) { - return CompactSketch.heapify(srcSeg); +// return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.COMPACT.getID()) { + return CompactSketch.heapify(srcSeg);//, Util.DEFAULT_UPDATE_SEED); } - return heapifyUpdateFromMemorySegment(srcSeg, Util.DEFAULT_UPDATE_SEED); + return heapifyUpdateSketchFromMemorySegment(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -83,13 +91,14 @@ public static Sketch heapify(final MemorySegment srcSeg) { * *

    The resulting sketch will not retain any link to the source MemorySegment.

    * - *

    For Update Sketches this method checks if the - * Default Update Seed

    - * was used to create the source MemorySegment image. + *

    For Update Sketches this method checks if the expectedSeed + * was used to create the source MemorySegment image.

    * *

    For Compact Sketches this method assumes that the sketch image was created with the * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.

    * + *

    Note: This assumes only SerVer 3 and later.

    + * * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. @@ -102,7 +111,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed if (family == Family.COMPACT) { return CompactSketch.heapify(srcSeg, expectedSeed); } - return heapifyUpdateFromMemorySegment(srcSeg, expectedSeed); + return heapifyUpdateSketchFromMemorySegment(srcSeg, expectedSeed); } /** @@ -275,6 +284,20 @@ public int getCountLessThanThetaLong(final long thetaLong) { */ public abstract double getEstimate(); + /** + * Gets the estimate from the given MemorySegment + * @param srcSeg the given MemorySegment + * @return the result estimate + */ + public static double getEstimate(final MemorySegment srcSeg) { + final int famId = extractFamilyID(srcSeg); + if (!isValidSketchID(famId)) { + throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. 
Family: " + + Family.idToFamily(famId).toString()); + } + return Sketch.estimate(extractThetaLong(srcSeg), getRetainedEntries(srcSeg)); + } + /** * Returns the Family that this sketch belongs to * @return the Family that this sketch belongs to @@ -351,6 +374,30 @@ public int getRetainedEntries() { return getRetainedEntries(true); } + /** + * Returns the number of valid entries that have been retained by the sketch from the given MemorySegment + * @param srcSeg the given MemorySegment that has an image of a Sketch + * @return the number of valid retained entries + */ + public static int getRetainedEntries(final MemorySegment srcSeg) { + final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); + if (serVer == 1) { + final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); + if (Sketch.getThetaLong(srcSeg) == Long.MAX_VALUE && entries == 0) { + return 0; + } + return entries; + } + //SerVer 2 or 3 + final int preLongs = Sketch.getPreambleLongs(srcSeg); + final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 + if (preLongs == 1) { + return empty ? 0 : 1; + } + //preLongs > 1 + return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); //for SerVer 1,2,3 + } + /** * Returns the number of entries that have been retained by the sketch. * @param valid if true, returns the number of valid entries, which are less than theta and used @@ -416,7 +463,7 @@ public double getUpperBound(final int numStdDev) { * @return true if the sketch is in estimation mode. 
*/ public boolean isEstimationMode() { - return estMode(getThetaLong(), isEmpty()); + return getThetaLong() < Long.MAX_VALUE && !isEmpty(); } /** @@ -606,6 +653,23 @@ public static String toString(final MemorySegment seg) { */ abstract short getSeedHash(); + static boolean getEmpty(final MemorySegment srcSeg) { + final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); + if (serVer == 1) { + return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; + } + return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2,3,4 + } + + static int getPreambleLongs(final MemorySegment srcSeg) { + return getAndCheckPreLongs(srcSeg); //for SerVer 1,2,3,4 + } + + static long getThetaLong(final MemorySegment srcSeg) { + final int preLongs = Sketch.getPreambleLongs(srcSeg); + return preLongs < 3 ? Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3,4 + } + /** * Returns true if given Family id is one of the theta sketches * @param id the given Family id @@ -617,44 +681,49 @@ static final boolean isValidSketchID(final int id) { || id == Family.COMPACT.getID(); } - /** - * Checks Ordered and Compact flags for integrity between sketch and a MemorySegment - * @param sketch the given sketch - */ - static final void checkSketchAndMemorySegmentFlags(final Sketch sketch) { - final MemorySegment seg = sketch.getMemorySegment(); - if (seg == null) { return; } - final int flags = PreambleUtil.extractFlags(seg); - if ((flags & COMPACT_FLAG_MASK) > 0 ^ sketch.isCompact()) { - throw new SketchesArgumentException("Possible corruption: " - + "MemorySegment Compact Flag inconsistent with Sketch"); - } - if ((flags & ORDERED_FLAG_MASK) > 0 ^ sketch.isOrdered()) { - throw new SketchesArgumentException("Possible corruption: " - + "MemorySegment Ordered Flag inconsistent with Sketch"); - } - } - static final double estimate(final long thetaLong, final int curCount) { return curCount * (LONG_MAX_VALUE_AS_DOUBLE / thetaLong); } - 
static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev, - final boolean empty) { + /** + * Gets the approximate lower error bound from a valid MemorySegment image of a Sketch + * given the specified number of Standard Deviations. + * This will return getEstimate() if isEmpty() is true. + * + * @param numStdDev + * See Number of Standard Deviations + * @param srcSeg the source MemorySegment + * @return the lower bound. + */ + public static double getLowerBound(final int numStdDev, final MemorySegment srcSeg) { + return lowerBound(getRetainedEntries(srcSeg), Sketch.getThetaLong(srcSeg), numStdDev, Sketch.getEmpty(srcSeg)); + } + + static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev, final boolean empty) { final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; return BinomialBoundsN.getLowerBound(curCount, theta, numStdDev, empty); } + /** + * Gets the approximate upper error bound from a valid MemorySegment image of a Sketch + * given the specified number of Standard Deviations. + * This will return getEstimate() if isEmpty() is true. + * + * @param numStdDev + * See Number of Standard Deviations + * @param srcSeg the source MemorySegment + * @return the upper bound. + */ + public static double getUpperBound(final int numStdDev, final MemorySegment srcSeg) { + return upperBound(getRetainedEntries(srcSeg), Sketch.getThetaLong(srcSeg), numStdDev, Sketch.getEmpty(srcSeg)); + } + static final double upperBound(final int curCount, final long thetaLong, final int numStdDev, final boolean empty) { final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; return BinomialBoundsN.getUpperBound(curCount, theta, numStdDev, empty); } - private static final boolean estMode(final long thetaLong, final boolean empty) { - return thetaLong < Long.MAX_VALUE && !empty; - } - /** * Instantiates a Heap Update Sketch from MemorySegment. Only SerVer3. SerVer 1 & 2 already handled. 
* @param srcSeg the source MemorySegment @@ -662,7 +731,7 @@ private static final boolean estMode(final long thetaLong, final boolean empty) * See Update Hash Seed. * @return a Sketch */ - private static final Sketch heapifyUpdateFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { + private static final Sketch heapifyUpdateSketchFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { final long cap = srcSeg.byteSize(); if (cap < 8) { throw new SketchesArgumentException( diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java deleted file mode 100644 index 69d945c40..000000000 --- a/src/main/java/org/apache/datasketches/theta/Sketches.java +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; - -import java.lang.foreign.MemorySegment; - -import org.apache.datasketches.common.Family; -import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.common.Util; - -/** - * This class brings together the common sketch and set operation creation methods and - * the public static methods into one place. - * - * @author Lee Rhodes - */ -public final class Sketches { - - private Sketches() {} - - /** - * Gets the unique count estimate from a valid MemorySegment image of a Sketch - * @param srcSeg the source MemorySegment - * @return the sketch's best estimate of the cardinality of the input stream. - */ - public static double getEstimate(final MemorySegment srcSeg) { - checkIfValidThetaSketch(srcSeg); - return Sketch.estimate(getThetaLong(srcSeg), getRetainedEntries(srcSeg)); - } - - /** - * Gets the approximate lower error bound from a valid MemorySegment image of a Sketch - * given the specified number of Standard Deviations. - * This will return getEstimate() if isEmpty() is true. - * - * @param numStdDev - * See Number of Standard Deviations - * @param srcSeg the source MemorySegment - * @return the lower bound. 
- */ - public static double getLowerBound(final int numStdDev, final MemorySegment srcSeg) { - return Sketch.lowerBound(getRetainedEntries(srcSeg), getThetaLong(srcSeg), numStdDev, getEmpty(srcSeg)); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxAnotBResultBytes(int)}. - * Returns the maximum number of bytes for the returned CompactSketch, given the maximum - * value of nomEntries of the first sketch A of AnotB. - * @param maxNomEntries the given value - * @return the maximum number of bytes. - */ - public static int getMaxAnotBResultBytes(final int maxNomEntries) { - return SetOperation.getMaxAnotBResultBytes(maxNomEntries); - } - - /** - * Returns the maximum number of storage bytes required for a CompactSketch with the given - * number of actual entries. - * @param numberOfEntries the actual number of retained entries stored in the sketch. - * @return the maximum number of storage bytes required for a CompactSketch with the given number - * of retained entries. - */ - public static int getMaxCompactSketchBytes(final int numberOfEntries) { - return Sketch.getMaxCompactSketchBytes(numberOfEntries); - } - - /** - * Returns the maximum number of storage bytes required for a CompactSketch given the configured - * log_base2 of the number of nominal entries, which is a power of 2. - * @param lgNomEntries Nominal Entries - * @return the maximum number of storage bytes required for a CompactSketch with the given - * lgNomEntries. 
- * @see Sketch#getCompactSketchMaxBytes(int) - */ - public static int getCompactSketchMaxBytes(final int lgNomEntries) { - return Sketch.getCompactSketchMaxBytes(lgNomEntries); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxIntersectionBytes(int)} - * @param nomEntries Ref: {@link SetOperation#getMaxIntersectionBytes(int)}, {@code nomEntries} - * @return Ref: {@link SetOperation#getMaxIntersectionBytes(int)} - */ - public static int getMaxIntersectionBytes(final int nomEntries) { - return SetOperation.getMaxIntersectionBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxUnionBytes(int)} - * @param nomEntries Ref: {@link SetOperation#getMaxUnionBytes(int)}, {@code nomEntries} - * @return Ref: {@link SetOperation#getMaxUnionBytes(int)} - */ - public static int getMaxUnionBytes(final int nomEntries) { - return SetOperation.getMaxUnionBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link Sketch#getMaxUpdateSketchBytes(int)} - * @param nomEntries Ref: {@link Sketch#getMaxUpdateSketchBytes(int)}, {@code nomEntries} - * @return Ref: {@link Sketch#getMaxUpdateSketchBytes(int)} - */ - public static int getMaxUpdateSketchBytes(final int nomEntries) { - return Sketch.getMaxUpdateSketchBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link Sketch#getSerializationVersion(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#getSerializationVersion(MemorySegment)}, {@code srcSeg} - * @return Ref: {@link Sketch#getSerializationVersion(MemorySegment)} - */ - public static int getSerializationVersion(final MemorySegment srcSeg) { - return Sketch.getSerializationVersion(srcSeg); - } - - /** - * Gets the approximate upper error bound from a valid MemorySegment image of a Sketch - * given the specified number of Standard Deviations. - * This will return getEstimate() if isEmpty() is true. 
- * - * @param numStdDev - * See Number of Standard Deviations - * @param srcSeg the source MemorySegment - * @return the upper bound. - */ - public static double getUpperBound(final int numStdDev, final MemorySegment srcSeg) { - return Sketch.upperBound(getRetainedEntries(srcSeg), getThetaLong(srcSeg), numStdDev, getEmpty(srcSeg)); - } - - //Heapify Operations - - /** - * Convenience method, ref: {@link CompactSketch#heapify(MemorySegment) CompactSketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link CompactSketch#heapify(MemorySegment) CompactSketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch heapifyCompactSketch(final MemorySegment srcSeg) { - return CompactSketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed Ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch heapifyCompactSketch(final MemorySegment srcSeg, final long expectedSeed) { - return CompactSketch.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link CompactSketch#wrap(MemorySegment) CompactSketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link CompactSketch#wrap(MemorySegment) CompactSketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch wrapCompactSketch(final MemorySegment srcSeg) { - return CompactSketch.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link 
CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed Ref: {@link CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch wrapCompactSketch(final MemorySegment srcSeg, final long expectedSeed) { - return CompactSketch.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link SetOperation#heapify(MemorySegment) SetOperation.heapify(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#heapify(MemorySegment) SetOperation.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation heapifySetOperation(final MemorySegment srcSeg) { - return SetOperation.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)}, - * {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation heapifySetOperation(final MemorySegment srcSeg, final long expectedSeed) { - return SetOperation.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link Sketch#heapify(MemorySegment) Sketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#heapify(MemorySegment) Sketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link Sketch Sketch} - */ - public static Sketch heapifySketch(final MemorySegment srcSeg) { - return Sketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)}, {@code expectedSeed} - * @return {@link Sketch Sketch} - */ - public static Sketch heapifySketch(final MemorySegment srcSeg, final long expectedSeed) { - return Sketch.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link UpdateSketch#heapify(MemorySegment) UpdateSketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link UpdateSketch#heapify(MemorySegment) UpdateSketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch heapifyUpdateSketch(final MemorySegment srcSeg) { - return UpdateSketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)}, - * {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch heapifyUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.heapify(srcSeg, expectedSeed); - } - - //Builders - - /** - * Ref: {@link SetOperationBuilder SetOperationBuilder} - * @return {@link SetOperationBuilder SetOperationBuilder} - */ - public static SetOperationBuilder setOperationBuilder() { - return new SetOperationBuilder(); - } - - /** - * Ref: {@link UpdateSketchBuilder UpdateSketchBuilder} - * @return {@link UpdateSketchBuilder UpdateSketchBuilder} - */ - public static UpdateSketchBuilder updateSketchBuilder() { - return new UpdateSketchBuilder(); - } - - //Wrap operations - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment)}, {@code srcSeg} - * @return a Intersection backed by the given MemorySegment - */ - public static Intersection wrapIntersection(final MemorySegment srcSeg) { - return (Intersection) SetOperation.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment) SetOperation.wrap(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment) SetOperation.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation wrapSetOperation(final MemorySegment srcSeg) { - return wrapSetOperation(srcSeg, Util.DEFAULT_UPDATE_SEED); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation wrapSetOperation(final MemorySegment srcSeg, final long expectedSeed) { - return SetOperation.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link Sketch#wrap(MemorySegment) Sketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#wrap(MemorySegment) Sketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link Sketch Sketch} - */ - public static Sketch wrapSketch(final MemorySegment srcSeg) { - return Sketch.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the expectedSeed used to validate the given MemorySegment image. - * Ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link Sketch Sketch} - */ - public static Sketch wrapSketch(final MemorySegment srcSeg, final long expectedSeed) { - return Sketch.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment)} and casts the result to a Union - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment)}, {@code srcSeg} - * @return a Union backed by the given MemorySegment. 
- */ - public static Union wrapUnion(final MemorySegment srcSeg) { - return (Union) SetOperation.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link UpdateSketch#wrap(MemorySegment) UpdateSketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link UpdateSketch#wrap(MemorySegment) UpdateSketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg) { - return wrapUpdateSketch(srcSeg, Util.DEFAULT_UPDATE_SEED); - } - - /** - * Convenience method, ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. - * Ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.wrap(srcSeg, null, expectedSeed); - } - - //Restricted static methods - - private static void checkIfValidThetaSketch(final MemorySegment srcSeg) { - final int fam = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - if (!Sketch.isValidSketchID(fam)) { - throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. 
Family: " - + Family.idToFamily(fam).toString()); - } - } - - static boolean getEmpty(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; - } - return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 - } - - static int getPreambleLongs(final MemorySegment srcSeg) { - return srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //for SerVer 1,2,3 - } - - static int getRetainedEntries(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); - if (getThetaLong(srcSeg) == Long.MAX_VALUE && entries == 0) { - return 0; - } - return entries; - } - //SerVer 2 or 3 - final int preLongs = getPreambleLongs(srcSeg); - final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 - if (preLongs == 1) { - return empty ? 0 : 1; - } - //preLongs > 1 - return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); //for SerVer 1,2,3 - } - - static long getThetaLong(final MemorySegment srcSeg) { - final int preLongs = getPreambleLongs(srcSeg); - return preLongs < 3 ? 
Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3 - } -} diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java index d921ec1ba..bbefd958c 100644 --- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java @@ -22,6 +22,8 @@ import static java.lang.Math.min; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static org.apache.datasketches.common.QuickSelect.selectExcludingZeros; +import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.UNION_THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.clearEmpty; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; @@ -34,6 +36,7 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.ResizeFactor; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; import org.apache.datasketches.thetacommon.HashOperations; @@ -279,7 +282,7 @@ public void union(final Sketch sketchIn) { gadget_.hashUpdate(sketchIn.getCache()[0]); return; } - Sketch.checkSketchAndMemorySegmentFlags(sketchIn); + UnionImpl.checkSketchAndMemorySegmentFlags(sketchIn); unionThetaLong_ = min(min(unionThetaLong_, sketchIn.getThetaLong()), gadget_.getThetaLong()); //Theta rule unionEmpty_ = false; @@ -372,4 +375,22 @@ boolean isEmpty() { return gadget_.isEmpty() && unionEmpty_; } + /** + * Checks Ordered and Compact flags for integrity between sketch and its MemorySegment + * @param sketch the given sketch + */ + private static final void checkSketchAndMemorySegmentFlags(final Sketch sketch) { + final MemorySegment seg = sketch.getMemorySegment(); + if (seg == null) { return; } + final int flags = 
PreambleUtil.extractFlags(seg); + if ((flags & COMPACT_FLAG_MASK) > 0 ^ sketch.isCompact()) { + throw new SketchesArgumentException("Possible corruption: " + + "MemorySegment Compact Flag inconsistent with Sketch"); + } + if ((flags & ORDERED_FLAG_MASK) > 0 ^ sketch.isOrdered()) { + throw new SketchesArgumentException("Possible corruption: " + + "MemorySegment Ordered Flag inconsistent with Sketch"); + } + } + } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 8fe93e46c..4cd3a4cd4 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -39,7 +39,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractP; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.PreambleUtil.getSegBytes; +import static org.apache.datasketches.theta.PreambleUtil.getUpdatableSegBytes; import static org.apache.datasketches.theta.UpdateReturnState.RejectedNullOrEmpty; import java.lang.foreign.MemorySegment; @@ -162,8 +162,16 @@ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expect @Override public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstWSeg) { - return componentsToCompact(getThetaLong(), getRetainedEntries(true), getSeedHash(), isEmpty(), - false, false, dstOrdered, dstWSeg, getCache()); + return componentsToCompact( + getThetaLong(), + getRetainedEntries(true), + getSeedHash(), + isEmpty(), + false, //is src compact + false, //is src ordered + dstOrdered, + dstWSeg, + getCache()); } @Override @@ -463,7 +471,7 @@ static void checkSegIntegrity(final MemorySegment srcSeg, final long expectedSee //Check seg capacity, lgArrLongs final long curCapBytes = srcSeg.byteSize(); - final int minReqBytes = 
getSegBytes(lgArrLongs, preambleLongs); + final int minReqBytes = getUpdatableSegBytes(lgArrLongs, preambleLongs); if (curCapBytes < minReqBytes) { throw new SketchesArgumentException( "Possible corruption: Current MemorySegment size < min required size: " diff --git a/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java b/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java index e400dd1fa..6482712e8 100644 --- a/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java +++ b/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java @@ -25,16 +25,10 @@ import static org.testng.Assert.fail; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.AnotB; -import org.apache.datasketches.theta.AnotBimpl; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; import org.testng.annotations.Test; /** @@ -295,19 +289,18 @@ public void checkAnotBnotC_sameMemorySegment() { @Test public void checkAnotBsimple() { - final UpdateSketch skA = Sketches.updateSketchBuilder().build(); - final UpdateSketch skB = Sketches.updateSketchBuilder().build(); - final AnotB aNotB = Sketches.setOperationBuilder().buildANotB(); + final UpdateSketch skA = UpdateSketch.builder().build(); + final UpdateSketch skB =UpdateSketch.builder().build(); + final AnotB aNotB = SetOperation.builder().buildANotB(); final CompactSketch csk = aNotB.aNotB(skA, skB); assertEquals(csk.getCurrentBytes(), 8); } @Test public void checkGetResult() { - final UpdateSketch skA = Sketches.updateSketchBuilder().build(); - final UpdateSketch skB = Sketches.updateSketchBuilder().build(); - - final AnotB aNotB = 
Sketches.setOperationBuilder().buildANotB(); + final UpdateSketch skA = UpdateSketch.builder().build(); + final UpdateSketch skB = UpdateSketch.builder().build(); + final AnotB aNotB = SetOperation.builder().buildANotB(); final CompactSketch csk = aNotB.aNotB(skA, skB); assertEquals(csk.getCurrentBytes(), 8); } @@ -321,7 +314,7 @@ public void checkGetFamily() { @Test public void checkGetMaxBytes() { - final int bytes = Sketches.getMaxAnotBResultBytes(10); + final int bytes = SetOperation.getMaxAnotBResultBytes(10); assertEquals(bytes, 16 * 15 + 24); } diff --git a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java index 8ef889be4..fc35891b3 100644 --- a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java @@ -26,25 +26,13 @@ import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; +import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.DirectCompactSketch; -import org.apache.datasketches.theta.EmptyCompactSketch; -import org.apache.datasketches.theta.HashIterator; -import org.apache.datasketches.theta.HeapCompactSketch; -import org.apache.datasketches.theta.Intersection; -import org.apache.datasketches.theta.SingleItemSketch; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.WrappedCompactCompressedSketch; -import org.apache.datasketches.theta.WrappedCompactSketch; import org.testng.annotations.Test; -import java.lang.foreign.Arena; - /** * @author Lee Rhodes */ @@ -186,7 +174,7 @@ private static void 
checkOtherCompactSketch(final Sketch testSk, final Sketch re @Test public void checkDirectSingleItemSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final int bytes = sk.getCompactBytes(); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); @@ -230,7 +218,7 @@ public void checkSegTooSmallOrdered() { @Test public void checkCompactCachePart() { //phony values except for curCount = 0. - final long[] result = Intersection.compactCachePart(null, 4, 0, 0L, false); + final long[] result = IntersectionImpl.compactCachePart(null, 4, 0, 0L, false); assertEquals(result.length, 0); } @@ -250,7 +238,7 @@ public void checkCompactCachePart() { * Empty, segment-based Compact sketches are always ordered */ public void checkEmptyMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final MemorySegment wseg1 = MemorySegment.ofArray(new byte[16]); final CompactSketch csk1 = sk.compact(false, wseg1); //the first parameter is ignored when empty @@ -290,7 +278,7 @@ public void checkEmptyMemorySegmentCompactSketch() { * Single-Item, segment-based Compact sketches are always ordered: */ public void checkSingleItemMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final MemorySegment wseg1 = MemorySegment.ofArray(new byte[16]); @@ -321,7 +309,7 @@ public void checkSingleItemMemorySegmentCompactSketch() { @Test public void checkMultipleItemMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); //This sequence is naturally out-of-order by the hash values. 
sk.update(1); sk.update(2); @@ -360,7 +348,7 @@ public void checkMultipleItemMemorySegmentCompactSketch() { * All empty, heap-based, compact sketches point to the same static, final constant of 8 bytes. */ public void checkEmptyHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final CompactSketch csk1 = sk.compact(false, null); //the first parameter is ignored when empty final State state1 = new State("EmptyCompactSketch", 0, 8, COMPACT, EMPTY, !DIRECT, !SEGMENT, ORDERED, !ESTIMATION); @@ -390,7 +378,7 @@ public void checkEmptyHeapCompactSketch() { * Single-Item, heap-based Compact sketches are always ordered. */ public void checkSingleItemHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final CompactSketch csk1 = sk.compact(false, null); //the first parameter is ignored when single item @@ -418,7 +406,7 @@ public void checkSingleItemHeapCompactSketch() { @Test public void checkMultipleItemHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); //This sequence is naturally out-of-order by the hash values. 
sk.update(1); sk.update(2); @@ -453,9 +441,9 @@ public void checkMultipleItemHeapCompactSketch() { @Test public void checkHeapifySingleItemSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); - final int bytes = Sketches.getMaxCompactSketchBytes(2); //1 more than needed + final int bytes = Sketch.getMaxCompactSketchBytes(2); //1 more than needed final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); sk.compact(false, wseg); final Sketch csk = Sketch.heapify(wseg); @@ -464,7 +452,7 @@ public void checkHeapifySingleItemSketch() { @Test public void checkHeapifyEmptySketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final MemorySegment wseg = MemorySegment.ofArray(new byte[16]); //empty, but extra bytes final CompactSketch csk = sk.compact(false, wseg); //ignores order because it is empty assertTrue(csk instanceof DirectCompactSketch); @@ -474,7 +462,7 @@ public void checkHeapifyEmptySketch() { @Test public void checkGetCache() { - final UpdateSketch sk = Sketches.updateSketchBuilder().setP((float).5).build(); + final UpdateSketch sk = UpdateSketch.builder().setP((float).5).build(); sk.update(7); final int bytes = sk.getCompactBytes(); final CompactSketch csk = sk.compact(true, MemorySegment.ofArray(new byte[bytes])); @@ -484,7 +472,7 @@ public void checkGetCache() { @Test public void checkHeapCompactSketchCompact() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); sk.update(2); final CompactSketch csk = sk.compact(); @@ -506,7 +494,7 @@ public void checkDirectCompactSketchCompact() { final int lgK = 6; //empty - final UpdateSketch sk = Sketches.updateSketchBuilder().setLogNominalEntries(lgK).build(); + final UpdateSketch sk = UpdateSketch.builder().setLogNominalEntries(lgK).build(); bytes = 
sk.getCompactBytes(); //empty, 8 bytes wseg1 = MemorySegment.ofArray(new byte[bytes]); wseg2 = MemorySegment.ofArray(new byte[bytes]); @@ -566,7 +554,7 @@ public void checkDirectCompactSketchCompact() { @Test public void serializeDeserializeHeapV4() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -583,7 +571,7 @@ public void serializeDeserializeHeapV4() { @Test public void serializeDeserializeDirectV4_segment() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -600,7 +588,7 @@ public void serializeDeserializeDirectV4_segment() { @Test public void serializeDeserializeDirectV4_bytes() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -618,7 +606,7 @@ public void serializeDeserializeDirectV4_bytes() { @Test public void serializeWrapBytesV3() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -636,7 +624,7 @@ public void serializeWrapBytesV3() { @Test public void serializeWrapBytesV4() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java index 28f78ac18..e0816b0e5 100644 --- a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java @@ 
-33,15 +33,6 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.ConcurrentDirectQuickSelectSketch; -import org.apache.datasketches.theta.ConcurrentHeapThetaBuffer; -import org.apache.datasketches.theta.ConcurrentSharedThetaSketch; -import org.apache.datasketches.theta.DirectQuickSelectSketch; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.UpdateSketchBuilder; import org.apache.datasketches.theta.ConcurrentHeapQuickSelectSketchTest.SharedLocal; import org.apache.datasketches.thetacommon.HashOperations; import org.testng.annotations.Test; @@ -79,7 +70,7 @@ public void checkHeapifyMemorySegmentEstimating() { assertEquals(local.getClass().getSimpleName(), "ConcurrentHeapThetaBuffer"); //This sharedHeap is not linked to the concurrent local buffer - final UpdateSketch sharedHeap = Sketches.heapifyUpdateSketch(sl.wseg); + final UpdateSketch sharedHeap = UpdateSketch.heapify(sl.wseg); assertEquals(sharedHeap.getClass().getSimpleName(), "HeapQuickSelectSketch"); checkMemorySegmentDirectProxyMethods(local, shared); @@ -509,7 +500,7 @@ public void checkConstructReconstructFromMemorySegment() { serArr = shared.toByteArray(); final MemorySegment seg = MemorySegment.ofArray(serArr); - final UpdateSketch recoveredShared = Sketches.wrapUpdateSketch(seg); + final UpdateSketch recoveredShared = UpdateSketch.wrap(seg); //reconstruct to Native/Direct final int bytes = Sketch.getMaxUpdateSketchBytes(k); diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java index 7ba11c1c9..c354fd344 100644 --- 
a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java @@ -34,17 +34,6 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.ConcurrentHeapQuickSelectSketch; -import org.apache.datasketches.theta.ConcurrentHeapThetaBuffer; -import org.apache.datasketches.theta.ConcurrentPropagationService; -import org.apache.datasketches.theta.ConcurrentSharedThetaSketch; -import org.apache.datasketches.theta.HeapQuickSelectSketch; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.UpdateSketchBuilder; import org.testng.annotations.Test; /** @@ -173,7 +162,7 @@ public void checkHeapifyByteArrayExact() { final byte[] serArr = shared.toByteArray(); final MemorySegment srcSeg = MemorySegment.ofArray(serArr).asReadOnly(); - final Sketch recoveredShared = Sketches.heapifyUpdateSketch(srcSeg); + final Sketch recoveredShared = UpdateSketch.heapify(srcSeg); //reconstruct to Native/Direct final int bytes = Sketch.getMaxUpdateSketchBytes(k); diff --git a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java index e812ab8f2..59b6396b7 100644 --- a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java +++ b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java @@ -31,19 +31,12 @@ import static org.testng.Assert.fail; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; -import org.apache.datasketches.common.Util; -import 
org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.Intersection; -import org.apache.datasketches.theta.IntersectionImpl; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.Union; -import org.apache.datasketches.theta.UpdateSketch; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesReadOnlyException; import org.apache.datasketches.common.SketchesStateException; +import org.apache.datasketches.common.Util; import org.testng.annotations.Test; /** @@ -471,7 +464,7 @@ public void checkWrapVirginEmpty() { MemorySegment iSeg = MemorySegment.ofArray(new byte[segBytes]); inter1 = SetOperation.builder().buildIntersection(iSeg); //virgin off-heap - inter2 = Sketches.wrapIntersection(iSeg); //virgin off-heap, identical to inter1 + inter2 = Intersection.wrap(iSeg); //virgin off-heap, identical to inter1 //both in virgin state, empty = false //note: both inter1 and inter2 are tied to the same MemorySegment, // so an intersect to one also affects the other. Don't do what I do! @@ -493,7 +486,7 @@ public void checkWrapVirginEmpty() { //test the path via toByteArray, now in a different state iSeg = MemorySegment.ofArray(inter1.toByteArray()); - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); assertTrue(inter2.hasResult()); //still true //test the compaction path @@ -514,7 +507,7 @@ public void checkWrapNullEmpty2() { final MemorySegment iSeg = MemorySegment.ofArray(segArr); inter1 = SetOperation.builder().buildIntersection(iSeg); //virgin - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); //both in virgin state, empty = false assertFalse(inter1.hasResult()); assertFalse(inter2.hasResult()); @@ -525,7 +518,7 @@ public void checkWrapNullEmpty2() { //remains empty = false. 
inter1.intersect(sk1); - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); assertTrue(inter1.hasResult()); assertTrue(inter2.hasResult()); final CompactSketch comp = inter2.getResult(true, null); @@ -579,7 +572,7 @@ public void checkBadPreambleLongs() { final MemorySegment seg = MemorySegment.ofArray(byteArray); //corrupt: seg.set(JAVA_BYTE, PREAMBLE_LONGS_BYTE, (byte) 2);//RF not used = 0 - Sketches.wrapIntersection(seg); + Intersection.wrap(seg); } @Test(expectedExceptions = SketchesArgumentException.class) @@ -596,18 +589,19 @@ public void checkBadSerVer() { final MemorySegment seg = MemorySegment.ofArray(byteArray); //corrupt: seg.set(JAVA_BYTE, SER_VER_BYTE, (byte) 2); - Sketches.wrapIntersection(seg); //throws in SetOperations + Intersection.wrap(seg); //throws in SetOperations } - @Test(expectedExceptions = ClassCastException.class) - public void checkFamilyID() { + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkIncorrectWrap() { final int k = 32; Union union; union = SetOperation.builder().setNominalEntries(k).buildUnion(); final byte[] byteArray = union.toByteArray(); final MemorySegment seg = MemorySegment.ofArray(byteArray); - Sketches.wrapIntersection(seg); + Intersection.wrap(seg); //wrong sketch Family + //Sketches.wrapIntersection(seg); } @Test @@ -637,7 +631,7 @@ public void checkWrap() { final byte[] segArr2 = inter.toByteArray(); final MemorySegment srcSeg = MemorySegment.ofArray(segArr2); - inter2 = Sketches.wrapIntersection(srcSeg); + inter2 = Intersection.wrap(srcSeg); //2nd call = valid intersecting sk2 = UpdateSketch.builder().setNominalEntries(k).build(); @@ -656,7 +650,7 @@ public void checkWrap() { final byte[] segArr3 = inter2.toByteArray(); final MemorySegment srcSeg2 = MemorySegment.ofArray(segArr3); - inter3 = Sketches.wrapIntersection(srcSeg2); + inter3 = Intersection.wrap(srcSeg2); resultComp2 = inter3.getResult(false, null); est2 = resultComp2.getEstimate(); println("Est2: 
"+est2); @@ -683,13 +677,13 @@ public void checkExceptionMinSize() { @Test public void checkGetResult() { final int k = 1024; - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final int segBytes = getMaxIntersectionBytes(k); final byte[] segArr = new byte[segBytes]; final MemorySegment iSeg = MemorySegment.ofArray(segArr); - final Intersection inter = Sketches.setOperationBuilder().buildIntersection(iSeg); + final Intersection inter = SetOperation.builder().buildIntersection(iSeg); inter.intersect(sk); final CompactSketch csk = inter.getResult(); assertEquals(csk.getCompactBytes(), 8); @@ -732,8 +726,8 @@ public void checkExceptions2() { public void checkOverlappedDirect() { final int k = 1 << 4; final int segBytes = 2*k*16 +PREBYTES; //plenty of room - final UpdateSketch sk1 = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - final UpdateSketch sk2 = Sketches.updateSketchBuilder().setNominalEntries(k).build(); + final UpdateSketch sk1 = UpdateSketch.builder().setNominalEntries(k).build(); + final UpdateSketch sk2 = UpdateSketch.builder().setNominalEntries(k).build(); for (int i=0; i SerVer 3 has defaultSeedHash, because seed was not given above + assertEquals(cskResult.getSeedHash(), defaultSeedHash); } - @Test + @Test //Compact Assumed Different Seed public void checkHeapifyCompactSketchAssumedDifferentSeed() { final int k = 64; final long seed = 128L; @@ -79,24 +79,25 @@ public void checkHeapifyCompactSketchAssumedDifferentSeed() { CompactSketch cskResult; //SerialVersion3 test - cskResult = Sketches.heapifyCompactSketch(cskSeg); + cskResult = CompactSketch.heapify(cskSeg); //don't check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); + assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here //SerialVersion2 test final MemorySegment sv2cskSeg = 
BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = Sketches.heapifyCompactSketch(sv2cskSeg); + cskResult = CompactSketch.heapify(sv2cskSeg); //don't check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); + assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here //SerialVersion1 test final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = Sketches.heapifyCompactSketch(sv1cskSeg); + cskResult = CompactSketch.heapify(sv1cskSeg); //don't check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); + //SerVer 1 -> SerVer 3 has defaultSeedHash, because seed was not given above assertEquals(cskResult.getSeedHash(), defaultSeedHash); } - @Test + @Test //Compact Given Default Seed public void checkHeapifyCompactSketchGivenDefaultSeed() { final int k = 64; final long seed = Util.DEFAULT_UPDATE_SEED; @@ -109,24 +110,25 @@ public void checkHeapifyCompactSketchGivenDefaultSeed() { CompactSketch cskResult; //SerialVersion3 test - cskResult = Sketches.heapifyCompactSketch(cskSeg, seed); + cskResult = CompactSketch.heapify(cskSeg, seed); //check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); + assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here //SerialVersion2 test final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = Sketches.heapifyCompactSketch(sv2cskSeg, seed); + cskResult = CompactSketch.heapify(sv2cskSeg, seed); //check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); + assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here //SerialVersion1 test final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - 
cskResult = Sketches.heapifyCompactSketch(sv1cskSeg, seed); + cskResult = CompactSketch.heapify(sv1cskSeg, seed); //check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); + //SerVer 1 -> SerVer 3, was given seed above, so we can test for it. + assertEquals(cskResult.getSeedHash(), seedHash); //SerVer 1 -> SerVer3 has defaultSeedHash } - @Test + @Test //Compact Given Different Seed public void checkHeapifyCompactSketchGivenDifferentSeed() { final int k = 64; final long seed = 128L; @@ -139,23 +141,26 @@ public void checkHeapifyCompactSketchGivenDifferentSeed() { CompactSketch cskResult; //SerialVersion3 test - cskResult = Sketches.heapifyCompactSketch(cskSeg, seed); + cskResult = CompactSketch.heapify(cskSeg, seed); //check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion2 test final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = Sketches.heapifyCompactSketch(sv2cskSeg, seed); + cskResult = CompactSketch.heapify(sv2cskSeg, seed); //check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion1 test final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = Sketches.heapifyCompactSketch(sv1cskSeg, seed); + cskResult = CompactSketch.heapify(sv1cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); + //SerVer 1 -> SerVer 3, was given seed above, so we can test for it. 
assertEquals(cskResult.getSeedHash(), seedHash); } + //Heapify Sketch + @Test public void checkHeapifySketchAssumedDefaultSeed() { final int k = 64; @@ -169,19 +174,19 @@ public void checkHeapifySketchAssumedDefaultSeed() { CompactSketch cskResult; //SerialVersion3 test - cskResult = (CompactSketch) Sketches.heapifySketch(cskSeg); + cskResult = (CompactSketch) Sketch.heapify(cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion2 test final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = (CompactSketch) Sketches.heapifySketch(sv2cskSeg); + cskResult = (CompactSketch) Sketch.heapify(sv2cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion1 test final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = (CompactSketch) Sketches.heapifySketch(sv1cskSeg); + cskResult = (CompactSketch) Sketch.heapify(sv1cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); } @@ -199,19 +204,19 @@ public void checkHeapifySketchAssumedDifferentSeed() { CompactSketch cskResult; //SerialVersion3 test - cskResult = (CompactSketch) Sketches.heapifySketch(cskSeg); + cskResult = (CompactSketch) Sketch.heapify(cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion2 test final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = (CompactSketch) Sketches.heapifySketch(sv2cskSeg); + cskResult = (CompactSketch) Sketch.heapify(sv2cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion1 test final MemorySegment sv1cskSeg = 
BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = (CompactSketch) Sketches.heapifySketch(sv1cskSeg); + cskResult = (CompactSketch) Sketch.heapify(sv1cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), defaultSeedHash); } @@ -229,19 +234,19 @@ public void checkHeapifySketchGivenDefaultSeed() { CompactSketch cskResult; //SerialVersion3 test - cskResult = (CompactSketch) Sketches.heapifySketch(cskSeg, seed); + cskResult = (CompactSketch) Sketch.heapify(cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion2 test final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = (CompactSketch) Sketches.heapifySketch(sv2cskSeg, seed); + cskResult = (CompactSketch) Sketch.heapify(sv2cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion1 test final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = (CompactSketch) Sketches.heapifySketch(sv1cskSeg, seed); + cskResult = (CompactSketch) Sketch.heapify(sv1cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); } @@ -259,23 +264,25 @@ public void checkHeapifySketchGivenDifferentSeed() { CompactSketch cskResult; //SerialVersion3 test - cskResult = (CompactSketch) Sketches.heapifySketch(cskSeg, seed); + cskResult = (CompactSketch) Sketch.heapify(cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion2 test final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = (CompactSketch) Sketches.heapifySketch(sv2cskSeg, seed); + cskResult = (CompactSketch) Sketch.heapify(sv2cskSeg, seed); 
assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //SerialVersion1 test final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = (CompactSketch) Sketches.heapifySketch(sv1cskSeg, seed); + cskResult = (CompactSketch) Sketch.heapify(sv1cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); } + //Wrap CompactSketch + @Test public void checkWrapCompactSketchAssumedDefaultSeed() { final int k = 64; @@ -290,7 +297,7 @@ public void checkWrapCompactSketchAssumedDefaultSeed() { //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = Sketches.wrapCompactSketch(offHeap); + cskResult = CompactSketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); @@ -299,7 +306,7 @@ public void checkWrapCompactSketchAssumedDefaultSeed() { //SerialVersion2 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = Sketches.wrapCompactSketch(offHeap); + cskResult = CompactSketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -308,7 +315,7 @@ public void checkWrapCompactSketchAssumedDefaultSeed() { //SerialVersion1 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = Sketches.wrapCompactSketch(offHeap); + cskResult = CompactSketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -329,7 +336,7 @@ public void 
checkWrapCompactSketchAssumedDifferentSeed() { //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = Sketches.wrapCompactSketch(offHeap); + cskResult = CompactSketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); @@ -338,7 +345,7 @@ public void checkWrapCompactSketchAssumedDifferentSeed() { //SerialVersion2 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = Sketches.wrapCompactSketch(offHeap); + cskResult = CompactSketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -347,7 +354,7 @@ public void checkWrapCompactSketchAssumedDifferentSeed() { //SerialVersion1 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = Sketches.wrapCompactSketch(offHeap); + cskResult = CompactSketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), defaultSeedHash); assertFalse(cskResult.isOffHeap()); @@ -368,7 +375,7 @@ public void checkWrapCompactSketchGivenDefaultSeed() { //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = Sketches.wrapCompactSketch(offHeap, seed); + cskResult = CompactSketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); @@ -377,7 +384,7 @@ public void checkWrapCompactSketchGivenDefaultSeed() { //SerialVersion2 test try(Arena arena = Arena.ofConfined()) { offHeap = 
putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = Sketches.wrapCompactSketch(offHeap, seed); + cskResult = CompactSketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -386,7 +393,7 @@ public void checkWrapCompactSketchGivenDefaultSeed() { //SerialVersion1 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = Sketches.wrapCompactSketch(offHeap, seed); + cskResult = CompactSketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -407,7 +414,7 @@ public void checkWrapCompactSketchGivenDifferentSeed() { //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = Sketches.wrapCompactSketch(offHeap, seed); + cskResult = CompactSketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); @@ -416,7 +423,7 @@ public void checkWrapCompactSketchGivenDifferentSeed() { //SerialVersion2 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = Sketches.wrapCompactSketch(offHeap, seed); + cskResult = CompactSketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -425,13 +432,15 @@ public void checkWrapCompactSketchGivenDifferentSeed() { //SerialVersion1 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = Sketches.wrapCompactSketch(offHeap, seed); + 
cskResult = CompactSketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); } } + //Wrap Sketch + @Test public void checkWrapSketchAssumedDefaultSeed() { final int k = 64; @@ -446,7 +455,7 @@ public void checkWrapSketchAssumedDefaultSeed() { //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap); + cskResult = (CompactSketch) Sketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); @@ -455,7 +464,7 @@ public void checkWrapSketchAssumedDefaultSeed() { //SerialVersion2 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap); + cskResult = (CompactSketch) Sketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -464,7 +473,7 @@ public void checkWrapSketchAssumedDefaultSeed() { //SerialVersion1 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap); + cskResult = (CompactSketch) Sketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -485,7 +494,7 @@ public void checkWrapSketchAssumedDifferentSeed() { //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap); + cskResult = (CompactSketch) Sketch.wrap(offHeap); 
assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); @@ -494,7 +503,7 @@ public void checkWrapSketchAssumedDifferentSeed() { //SerialVersion2 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap); + cskResult = (CompactSketch) Sketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -503,7 +512,7 @@ public void checkWrapSketchAssumedDifferentSeed() { //SerialVersion1 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap); + cskResult = (CompactSketch) Sketch.wrap(offHeap); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), defaultSeedHash); assertFalse(cskResult.isOffHeap()); @@ -524,7 +533,7 @@ public void checkWrapSketchGivenDefaultSeed() { //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap, seed); + cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); @@ -533,7 +542,7 @@ public void checkWrapSketchGivenDefaultSeed() { //SerialVersion2 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap, seed); + cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), 
seedHash); assertFalse(cskResult.isOffHeap()); @@ -542,7 +551,7 @@ public void checkWrapSketchGivenDefaultSeed() { //SerialVersion1 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap, seed); + cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -563,7 +572,7 @@ public void checkWrapSketchGivenDifferentSeed() { //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap, seed); + cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); @@ -572,7 +581,7 @@ public void checkWrapSketchGivenDifferentSeed() { //SerialVersion2 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap, seed); + cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); @@ -581,7 +590,7 @@ public void checkWrapSketchGivenDifferentSeed() { //SerialVersion1 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = (CompactSketch) Sketches.wrapSketch(offHeap, seed); + cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); assertFalse(cskResult.isOffHeap()); diff --git 
a/src/test/java/org/apache/datasketches/theta/IteratorTest.java b/src/test/java/org/apache/datasketches/theta/IteratorTest.java index 51e7cd9ef..343142044 100644 --- a/src/test/java/org/apache/datasketches/theta/IteratorTest.java +++ b/src/test/java/org/apache/datasketches/theta/IteratorTest.java @@ -27,7 +27,7 @@ import org.apache.datasketches.theta.CompactSketch; import org.apache.datasketches.theta.HashIterator; import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; + import org.apache.datasketches.theta.UpdateSketch; @@ -41,7 +41,7 @@ public void checkDirectCompactSketch() { int k = 16; int maxBytes = Sketch.getMaxUpdateSketchBytes(k); MemorySegment wseg = MemorySegment.ofArray(new byte[maxBytes]); - UpdateSketch sk1 = Sketches.updateSketchBuilder().setNominalEntries(k).build(wseg); + UpdateSketch sk1 = UpdateSketch.builder().setNominalEntries(k).build(wseg); println(sk1.getClass().getSimpleName()); for (int i = 0; i < (k/2); i++) { sk1.update(i); } HashIterator itr1 = sk1.iterator(); @@ -76,7 +76,7 @@ public void checkDirectCompactSketch() { public void checkHeapAlphaSketch() { int k = 512; int u = 8; - UpdateSketch sk1 = Sketches.updateSketchBuilder().setNominalEntries(k).setFamily(Family.ALPHA) + UpdateSketch sk1 = UpdateSketch.builder().setNominalEntries(k).setFamily(Family.ALPHA) .build(); println(sk1.getClass().getSimpleName()); for (int i = 0; i < u; i++) { sk1.update(i); } @@ -92,7 +92,7 @@ public void checkHeapAlphaSketch() { public void checkHeapQSSketch() { int k = 16; int u = 8; - UpdateSketch sk1 = Sketches.updateSketchBuilder().setNominalEntries(k) + UpdateSketch sk1 = UpdateSketch.builder().setNominalEntries(k) .build(); println(sk1.getClass().getSimpleName()); for (int i = 0; i < u; i++) { sk1.update(i); } @@ -108,7 +108,7 @@ public void checkHeapQSSketch() { public void checkSingleItemSketch() { int k = 16; int u = 1; - UpdateSketch sk1 = Sketches.updateSketchBuilder().setNominalEntries(k) + UpdateSketch 
sk1 = UpdateSketch.builder().setNominalEntries(k) .build(); for (int i = 0; i < u; i++) { sk1.update(i); } diff --git a/src/test/java/org/apache/datasketches/theta/PairwiseSetOperationsTest.java b/src/test/java/org/apache/datasketches/theta/PairwiseSetOperationsTest.java index 00cedaeec..2d1e71103 100644 --- a/src/test/java/org/apache/datasketches/theta/PairwiseSetOperationsTest.java +++ b/src/test/java/org/apache/datasketches/theta/PairwiseSetOperationsTest.java @@ -28,7 +28,6 @@ import org.apache.datasketches.theta.Intersection; import org.apache.datasketches.theta.SetOperation; import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; import org.apache.datasketches.theta.Union; import org.apache.datasketches.theta.UpdateSketch; import org.testng.annotations.Test; @@ -52,7 +51,7 @@ public void checkIntersectionNoOverlap() { CompactSketch csk1 = usk1.compact(true, null); CompactSketch csk2 = usk2.compact(true, null); - Intersection inter = Sketches.setOperationBuilder().buildIntersection(); + Intersection inter = SetOperation.builder().buildIntersection(); Sketch rsk = inter.intersect(csk1, csk2); assertEquals(rsk.getEstimate(), 0.0); } @@ -64,7 +63,7 @@ public void checkIntersectionFullOverlap() { UpdateSketch usk1 = UpdateSketch.builder().setNominalEntries(k).build(); UpdateSketch usk2 = UpdateSketch.builder().setNominalEntries(k).build(); - Intersection inter = Sketches.setOperationBuilder().buildIntersection(); + Intersection inter = SetOperation.builder().buildIntersection(); for (int i=0; i Date: Tue, 30 Sep 2025 15:12:40 -0700 Subject: [PATCH 17/26] Update GH action workflows for Java 25 --- .github/workflows/auto-jdk-matrix.yml | 18 +++++++++--------- .github/workflows/auto-os-matrix.yml | 16 ++++++++-------- .github/workflows/check_cpp_files.yml | 18 +++++++++--------- .github/workflows/codeql-analysis.yml | 16 ++++++++-------- .github/workflows/javadoc.yml | 8 ++++---- 5 files changed, 38 insertions(+), 38 deletions(-) 
diff --git a/.github/workflows/auto-jdk-matrix.yml b/.github/workflows/auto-jdk-matrix.yml index 11281bbdb..0d387c944 100644 --- a/.github/workflows/auto-jdk-matrix.yml +++ b/.github/workflows/auto-jdk-matrix.yml @@ -1,12 +1,12 @@ name: Auto JDK Matrix Test & Install on: -# push: -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] -# pull_request: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# The branches below must be a subset of the branches above -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + push: + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + pull_request: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + # The branches below must be a subset of the branches above + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: env: @@ -27,7 +27,7 @@ jobs: steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: persist-credentials: false @@ -43,7 +43,7 @@ jobs: restore-keys: build-${{ runner.os }}-maven- - name: Install Matrix JDK - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ matrix.jdk }} distribution: 'temurin' @@ -67,7 +67,7 @@ jobs: -Dgpg.skip=true # Architecture options: x86, x64, armv7, aarch64, ppc64le -# setup-java@v4 has a "with cache" option +# setup-java@v5 has a "with cache" option # Lifecycle: validate, compile, test, package, verify, install, deploy # -B batch mode, never stops for user input # -V show Version without stopping diff --git a/.github/workflows/auto-os-matrix.yml b/.github/workflows/auto-os-matrix.yml index 13caa578b..df3c7135f 100644 --- a/.github/workflows/auto-os-matrix.yml +++ b/.github/workflows/auto-os-matrix.yml @@ -1,13 +1,13 @@ name: Auto OS Matrix Test & Install on: -# push: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', 
'**/LICENSE', '**/NOTICE' ] -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] -# pull_request: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + push: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + pull_request: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] # The branches below must be a subset of the branches above -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: env: @@ -41,7 +41,7 @@ jobs: steps: - name: "Checkout ${{ github.ref }} ( ${{ github.sha }} )" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: persist-credentials: false @@ -53,7 +53,7 @@ jobs: restore-keys: build-${{ runner.os }}-maven- - name: Install Matrix JDK - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ matrix.jdk }} distribution: 'temurin' diff --git a/.github/workflows/check_cpp_files.yml b/.github/workflows/check_cpp_files.yml index 44aff2f83..694aa139d 100644 --- a/.github/workflows/check_cpp_files.yml +++ b/.github/workflows/check_cpp_files.yml @@ -1,13 +1,13 @@ name: CPP SerDe Compatibility Test on: -# push: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] -# pull_request: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + push: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + pull_request: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] # The branches below 
must be a subset of the branches above -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: jobs: @@ -16,16 +16,16 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Checkout C++ - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: repository: apache/datasketches-cpp path: cpp - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: '25' distribution: 'temurin' diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index b6c723f7c..0a6de05d9 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -1,13 +1,13 @@ name: "CodeQL" on: -# push: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] -# pull_request: -# paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + push: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + pull_request: + paths-ignore: [ '**/*.html', '**/*.md', '**/*.txt', '**/*.xml', '**/*.yaml', '**/*.yml', '**/LICENSE', '**/NOTICE' ] # The branches below must be a subset of the branches above -# branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] + branches: [ 'main', '[0-9]+.[0-9]+.[Xx]' ] workflow_dispatch: jobs: @@ -28,10 +28,10 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: distribution: 'temurin' cache: 'maven' diff --git a/.github/workflows/javadoc.yml b/.github/workflows/javadoc.yml index 1a98d91b8..66bab896a 100644 --- a/.github/workflows/javadoc.yml +++ 
b/.github/workflows/javadoc.yml @@ -1,8 +1,8 @@ name: JavaDoc on: -# push: -# branches: main + push: + branches: main workflow_dispatch: permissions: @@ -14,10 +14,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Setup Java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: '25' distribution: 'temurin' From 21398548cfd058b4874dd5bf511163c00ad3e37f Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 30 Sep 2025 17:01:18 -0700 Subject: [PATCH 18/26] Improve probabilistic test to make it extremely unlikely that it will fail just due to random variation in sequence. --- .../sampling/ReservoirItemsSketchTest.java | 116 +++++++++--------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java b/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java index df11869eb..18c3b6130 100644 --- a/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java @@ -45,10 +45,6 @@ import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesException; import org.apache.datasketches.common.SketchesStateException; -import org.apache.datasketches.sampling.PreambleUtil; -import org.apache.datasketches.sampling.ReservoirItemsSketch; -import org.apache.datasketches.sampling.ReservoirSize; -import org.apache.datasketches.sampling.SampleSubsetSummary; import org.testng.annotations.Test; public class ReservoirItemsSketchTest { @@ -534,60 +530,70 @@ public void checkForceIncrement() { @Test public void checkEstimateSubsetSum() { final int k = 10; - final ReservoirItemsSketch sketch = ReservoirItemsSketch.newInstance(k); - - // empty sketch -- all zeros - SampleSubsetSummary ss = sketch.estimateSubsetSum(item -> true); - assertEquals(ss.getEstimate(), 0.0); - 
assertEquals(ss.getTotalSketchWeight(), 0.0); - - // add items, keeping in exact mode + SampleSubsetSummary ss = null; double itemCount = 0.0; - for (long i = 1; i <= (k - 1); ++i) { - sketch.update(i); - itemCount += 1.0; - } - - ss = sketch.estimateSubsetSum(item -> true); - assertEquals(ss.getEstimate(), itemCount); - assertEquals(ss.getLowerBound(), itemCount); - assertEquals(ss.getUpperBound(), itemCount); - assertEquals(ss.getTotalSketchWeight(), itemCount); - - // add a few more items, pushing to sampling mode - for (long i = k; i <= (k + 1); ++i) { - sketch.update(i); - itemCount += 1.0; - } - - // predicate always true so estimate == upper bound - ss = sketch.estimateSubsetSum(item -> true); - assertEquals(ss.getEstimate(), itemCount); - assertEquals(ss.getUpperBound(), itemCount); - assertTrue(ss.getLowerBound() < itemCount); - assertEquals(ss.getTotalSketchWeight(), itemCount); - - // predicate always false so estimate == lower bound == 0.0 - ss = sketch.estimateSubsetSum(item -> false); - assertEquals(ss.getEstimate(), 0.0); - assertEquals(ss.getLowerBound(), 0.0); - assertTrue(ss.getUpperBound() > 0.0); - assertEquals(ss.getTotalSketchWeight(), itemCount); - - // finally, a non-degenerate predicate - // insert negative items with identical weights, filter for negative weights only - for (long i = 1; i <= (k + 1); ++i) { - sketch.update(-i); - itemCount += 1.0; - } - ss = sketch.estimateSubsetSum(item -> item < 0); - assertTrue(ss.getEstimate() >= ss.getLowerBound()); - assertTrue(ss.getEstimate() <= ss.getUpperBound()); + //trial loop for probabilistic testing + int passLB = 0; + int passUB = 0; + for (int t = 0; t < 3; t++) { + final ReservoirItemsSketch sketch = ReservoirItemsSketch.newInstance(k); + + // empty sketch -- all zeros + ss = sketch.estimateSubsetSum(item -> true); + assertEquals(ss.getEstimate(), 0.0); + assertEquals(ss.getTotalSketchWeight(), 0.0); + + // add items, keeping in exact mode + itemCount = 0.0; + for (long i = 1; i <= (k - 
1); ++i) { + sketch.update(i); + itemCount += 1.0; + } + + ss = sketch.estimateSubsetSum(item -> true); + assertEquals(ss.getEstimate(), itemCount); + assertEquals(ss.getLowerBound(), itemCount); + assertEquals(ss.getUpperBound(), itemCount); + assertEquals(ss.getTotalSketchWeight(), itemCount); + + // add a few more items, pushing to sampling mode + for (long i = k; i <= (k + 1); ++i) { + sketch.update(i); + itemCount += 1.0; + } + + // predicate always true so estimate == upper bound + ss = sketch.estimateSubsetSum(item -> true); + assertEquals(ss.getEstimate(), itemCount); + assertEquals(ss.getUpperBound(), itemCount); + assertTrue(ss.getLowerBound() < itemCount); + assertEquals(ss.getTotalSketchWeight(), itemCount); + + // predicate always false so estimate == lower bound == 0.0 + ss = sketch.estimateSubsetSum(item -> false); + assertEquals(ss.getEstimate(), 0.0); + assertEquals(ss.getLowerBound(), 0.0); + assertTrue(ss.getUpperBound() > 0.0); + assertEquals(ss.getTotalSketchWeight(), itemCount); + + // finally, a non-degenerate predicate + // insert negative items with identical weights, filter for negative weights only + for (long i = 1; i <= (k + 1); ++i) { + sketch.update(-i); + itemCount += 1.0; + } + + ss = sketch.estimateSubsetSum(item -> item < 0); + assertTrue(ss.getEstimate() >= ss.getLowerBound()); + assertTrue(ss.getEstimate() <= ss.getUpperBound()); + + // allow pretty generous bounds when testing + if(ss.getLowerBound() < (itemCount / 1.4)) { passLB++; } + if(ss.getUpperBound() > (itemCount / 2.6)) { passUB++; } + } //End trial loop + assertTrue(passLB >= 2 && passUB >= 2); //2 out of 3 must pass for LB and UB - // allow pretty generous bounds when testing - assertTrue(ss.getLowerBound() < (itemCount / 1.4)); - assertTrue(ss.getUpperBound() > (itemCount / 2.6)); assertEquals(ss.getTotalSketchWeight(), itemCount); } From b5639e846577db3fde73bbee065a8862d0a1848e Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 30 Sep 2025 17:09:21 -0700 Subject: 
[PATCH 19/26] Slight correction in test loop. --- .../datasketches/sampling/ReservoirItemsSketchTest.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java b/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java index 18c3b6130..738b46bd0 100644 --- a/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java +++ b/src/test/java/org/apache/datasketches/sampling/ReservoirItemsSketchTest.java @@ -530,8 +530,6 @@ public void checkForceIncrement() { @Test public void checkEstimateSubsetSum() { final int k = 10; - SampleSubsetSummary ss = null; - double itemCount = 0.0; //trial loop for probabilistic testing int passLB = 0; @@ -540,12 +538,12 @@ public void checkEstimateSubsetSum() { final ReservoirItemsSketch sketch = ReservoirItemsSketch.newInstance(k); // empty sketch -- all zeros - ss = sketch.estimateSubsetSum(item -> true); + SampleSubsetSummary ss = sketch.estimateSubsetSum(item -> true); assertEquals(ss.getEstimate(), 0.0); assertEquals(ss.getTotalSketchWeight(), 0.0); // add items, keeping in exact mode - itemCount = 0.0; + double itemCount = 0.0; for (long i = 1; i <= (k - 1); ++i) { sketch.update(i); itemCount += 1.0; @@ -591,10 +589,9 @@ public void checkEstimateSubsetSum() { // allow pretty generous bounds when testing if(ss.getLowerBound() < (itemCount / 1.4)) { passLB++; } if(ss.getUpperBound() > (itemCount / 2.6)) { passUB++; } + assertEquals(ss.getTotalSketchWeight(), itemCount); } //End trial loop assertTrue(passLB >= 2 && passUB >= 2); //2 out of 3 must pass for LB and UB - - assertEquals(ss.getTotalSketchWeight(), itemCount); } private static MemorySegment getBasicSerializedLongsRIS() { From 193cfdf571348ba1eb55945919c62a39ebc6230b Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 30 Sep 2025 18:05:08 -0700 Subject: [PATCH 20/26] Completely remove SerVer 1 and 2 from the code. 
--- .../datasketches/theta/CompactSketch.java | 24 +- .../theta/ForwardCompatibility.java | 165 ------------ .../theta/BackwardConversions.java | 240 ----------------- .../datasketches/theta/DirectUnionTest.java | 125 --------- .../apache/datasketches/theta/EmptyTest.java | 46 ---- .../theta/ForwardCompatibilityTest.java | 219 ---------------- .../datasketches/theta/HeapUnionTest.java | 116 --------- .../theta/HeapifyWrapSerVer1and2Test.java | 245 ------------------ .../datasketches/theta/SketchMiscTest.java | 28 -- .../apache/datasketches/theta/SketchTest.java | 38 +-- .../datasketches/theta/UnionImplTest.java | 43 --- 11 files changed, 3 insertions(+), 1286 deletions(-) delete mode 100644 src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java delete mode 100644 src/test/java/org/apache/datasketches/theta/BackwardConversions.java delete mode 100644 src/test/java/org/apache/datasketches/theta/ForwardCompatibilityTest.java diff --git a/src/main/java/org/apache/datasketches/theta/CompactSketch.java b/src/main/java/org/apache/datasketches/theta/CompactSketch.java index da890d9e3..ce5861d41 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/CompactSketch.java @@ -117,15 +117,7 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed if (enforceSeed && !empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } return CompactOperations.segmentToCompact(srcSeg, srcOrdered, null); } - //not SerVer 3, assume compact stored form - final short seedHash = Util.computeSeedHash(seed); - if (serVer == 1) { - return ForwardCompatibility.heapify1to3(srcSeg, seedHash); - } - if (serVer == 2) { - return ForwardCompatibility.heapify2to3(srcSeg, - enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); - } + //not SerVer 3 or 4 throw new SketchesArgumentException("Unknown Serialization Version: " + serVer); } @@ -191,13 +183,6 @@ private static CompactSketch wrap(final MemorySegment srcSeg, final long seed, f final short seedHash = Util.computeSeedHash(seed); switch (serVer) { - case 1: { - return ForwardCompatibility.heapify1to3(srcSeg, seedHash); - } - case 2: { - return ForwardCompatibility.heapify2to3(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); - } case 3: { if (PreambleUtil.isEmptyFlag(srcSeg)) { return EmptyCompactSketch.getHeapInstance(srcSeg); @@ -294,13 +279,6 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo final short seedHash = Util.computeSeedHash(seed); switch (serVer) { - case 1: { - return ForwardCompatibility.heapify1to3(MemorySegment.ofArray(bytes), seedHash); - } - case 2: { - return ForwardCompatibility.heapify2to3(MemorySegment.ofArray(bytes), - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); - } case 3: { final int flags = bytes[FLAGS_BYTE]; if ((flags & EMPTY_FLAG_MASK) > 0) { diff --git a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java deleted file mode 100644 index ec81f7c2e..000000000 --- a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; -import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; -import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; - -import java.lang.foreign.MemorySegment; - -import org.apache.datasketches.common.SketchesArgumentException; - -/** - * Used to convert older serialization versions 1 and 2 to version 3. The Serialization - * Version is the version of the sketch binary image format and should not be confused with the - * version number of the Open Source DataSketches Library. - * - * @author Lee Rhodes - */ -final class ForwardCompatibility { - - private ForwardCompatibility() { } - - /** - * Convert a serialization version (SerVer) 1 sketch (~Feb 2014) to a SerVer 3 sketch. - * Note: SerVer 1 sketches always have (metadata) preamble-longs of 3 and are always stored - * in a compact ordered form, but with 3 different sketch types. All SerVer 1 sketches will - * be converted to a SerVer 3 sketches. There is no concept of p-sampling, no empty bit. - * - * @param srcSeg the image of a SerVer 1 sketch - * - * @param seedHash See Seed Hash. - * The seedHash that matches the seedHash of the original seed used to construct the sketch. - * Note: SerVer 1 sketches do not have the concept of the SeedHash, so the seedHash provided here - * MUST be derived from the actual seed that was used when the SerVer 1 sketches were built. 
- * @return a SerVer 3 {@link CompactSketch}. - */ - static final CompactSketch heapify1to3(final MemorySegment srcSeg, final short seedHash) { - final int segCap = (int) srcSeg.byteSize(); - final int preLongs = Sketch.getPreambleLongs(srcSeg); //always 3 for serVer 1 - if (preLongs != 3) { - throw new SketchesArgumentException("PreLongs must be 3 for SerVer 1: " + preLongs); - } - final int familyId = extractFamilyID(srcSeg); //1,2,3 - if ((familyId < 1) || (familyId > 3)) { - throw new SketchesArgumentException("Family ID (Sketch Type) must be 1 to 3: " + familyId); - } - final int curCount = extractCurCount(srcSeg); - final long thetaLong = extractThetaLong(srcSeg); - final boolean empty = (curCount == 0) && (thetaLong == Long.MAX_VALUE); - - if (empty || (segCap <= 24)) { //return empty - return EmptyCompactSketch.getInstance(); - } - - final int reqCap = (curCount + preLongs) << 3; - validateInputSize(reqCap, segCap); - - if ((thetaLong == Long.MAX_VALUE) && (curCount == 1)) { - final long hash = srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //theta < 1.0 and/or curCount > 1 - - final long[] compactOrderedCache = new long[curCount]; - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong, true); - } - - /** - * Convert a serialization version (SerVer) 2 sketch to a SerVer 3 HeapCompactOrderedSketch. - * Note: SerVer 2 sketches can have metadata-longs of 1,2 or 3 and are always stored - * in a compact ordered form (not as a hash table), but with 4 different sketch types. - * @param srcSeg the image of a SerVer 2 sketch - * @param seedHash See Seed Hash. 
- * The seed used for building the sketch image in srcMem - * @return a SerVer 3 HeapCompactOrderedSketch - */ - static final CompactSketch heapify2to3(final MemorySegment srcSeg, final short seedHash) { - final int segCap = (int) srcSeg.byteSize(); - final int preLongs = Sketch.getPreambleLongs(srcSeg); //1,2 or 3 - final int familyId = extractFamilyID(srcSeg); //1,2,3,4 - if ((familyId < 1) || (familyId > 4)) { - throw new SketchesArgumentException("Family (Sketch Type) must be 1 to 4: " + familyId); - } - int reqBytesIn = 8; - int curCount = 0; - long thetaLong = Long.MAX_VALUE; - if (preLongs == 1) { - reqBytesIn = 8; - validateInputSize(reqBytesIn, segCap); - return EmptyCompactSketch.getInstance(); - } - if (preLongs == 2) { //includes pre0 + count, no theta (== 1.0) - reqBytesIn = preLongs << 3; - validateInputSize(reqBytesIn, segCap); - curCount = extractCurCount(srcSeg); - if (curCount == 0) { - return EmptyCompactSketch.getInstance(); - } - if (curCount == 1) { - reqBytesIn = (preLongs + 1) << 3; - validateInputSize(reqBytesIn, segCap); - final long hash = srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //curCount > 1 - reqBytesIn = (curCount + preLongs) << 3; - validateInputSize(reqBytesIn, segCap); - final long[] compactOrderedCache = new long[curCount]; - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong,true); - } - if (preLongs == 3) { //pre0 + count + theta - reqBytesIn = (preLongs) << 3; // - validateInputSize(reqBytesIn, segCap); - curCount = extractCurCount(srcSeg); - thetaLong = extractThetaLong(srcSeg); - if ((curCount == 0) && (thetaLong == Long.MAX_VALUE)) { - return EmptyCompactSketch.getInstance(); - } - if ((curCount == 1) && (thetaLong == Long.MAX_VALUE)) { - reqBytesIn = (preLongs + 1) << 3; - validateInputSize(reqBytesIn, segCap); - final long hash = 
srcSeg.get(JAVA_LONG_UNALIGNED, preLongs << 3); - return new SingleItemSketch(hash, seedHash); - } - //curCount > 1 and/or theta < 1.0 - reqBytesIn = (curCount + preLongs) << 3; - validateInputSize(reqBytesIn, segCap); - final long[] compactOrderedCache = new long[curCount]; - //srcSeg.getLongArray(preLongs << 3, compactOrderedCache, 0, curCount); - MemorySegment.copy(srcSeg, JAVA_LONG_UNALIGNED, preLongs << 3, compactOrderedCache, 0, curCount); - return new HeapCompactSketch(compactOrderedCache, false, seedHash, curCount, thetaLong, true); - } - throw new SketchesArgumentException("PreLongs must be 1,2, or 3: " + preLongs); - } - - private static final void validateInputSize(final int reqBytesIn, final int segCap) { - if (reqBytesIn > segCap) { - throw new SketchesArgumentException( - "Input MemorySegment or byte[] size is too small: Required Bytes: " + reqBytesIn - + ", bytesIn: " + segCap); - } - } - -} diff --git a/src/test/java/org/apache/datasketches/theta/BackwardConversions.java b/src/test/java/org/apache/datasketches/theta/BackwardConversions.java deleted file mode 100644 index a0688cbba..000000000 --- a/src/test/java/org/apache/datasketches/theta/BackwardConversions.java +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED; - -import java.lang.foreign.MemorySegment; -import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.EmptyCompactSketch; -import org.apache.datasketches.theta.SingleItemSketch; - -/** - * This class converts current compact sketches into prior SerVer 1 and SerVer 2 format for testing. - * - * @author Lee Rhodes - */ -public class BackwardConversions { - - /** - * Converts a SerVer3 ordered, heap CompactSketch to a SerVer1 ordered, SetSketch in MemorySegment. - * This is exclusively for testing purposes. - * - *

    V1 dates from roughly Aug 2014 to about May 2015. - * The library at that time had an early Theta sketch with set operations based on ByteBuffer, - * the Alpha sketch, and an early HLL sketch. It also had an early adaptor for Pig. - * It also had code for the even earlier CountUniqueSketch (for backward compatibility), - * which was the bucket sketch based on Giroire. - * - *

    Serialization Version 1:

    - *
    -   * Long || Start Byte Adr:
    -   * Adr:
    -   *      ||  7 |   6   |     5    |   4   |   3   |    2   |    1   |     0    |
    -   *  0   ||    | Flags | LgResize | LgArr | lgNom | SkType | SerVer | MD_LONGS |
    -   *
    -   *      || 15 |  14   |    13    |  12   |  11   |   10   |    9   |     8    |
    -   *  1   ||                               | ------------CurCount-------------- |
    -   *
    -   *      || 23 |  22   |    21    |  20   |  19   |   18   |   17   |    16    |
    -   *  2   || --------------------------THETA_LONG------------------------------ |
    -   *
    -   *      ||                                                         |    24    |
    -   *  3   || ----------------------Start of Long Array------------------------  |
    -   * 
    - * - *
      - *
    • The serialization for V1 was always to a compact form (no hash table spaces).
    • - *
    • MD_LONGS (Metadata Longs, now Preamble Longs) was always 3.
    • - *
    • SerVer is always 1.
    • - *
    • The SkType had three values: 1,2,3 for Alpha, QuickSelect, and SetSketch, - * respectively.
    • - *
    • Bytes lgNom and lgArr were only used by the QS and Alpha sketches.
    • - *
    • V1 LgResize (2 bits) was only relevant to the Alpha and QS sketches.
    • - *
    • The flags byte is in byte 6 (moved to 5 in V2).
    • - *
    • The only flag bits are BE(bit0)=0, and Read-Only(bit1)=1. Read-only was only set for the - * SetSketch.
    • - *
    • There is no seedHash.
    • - *
    • There is no concept of p-sampling so bytes 12-15 of Pre1 are empty.
    • - *
    • The determination of empty is when both curCount=0 and thetaLong = Long.MAX_VALUE.
    • - *
    - * - * @param skV3 a SerVer3, ordered CompactSketch - * @return a SerVer1 SetSketch as MemorySegment object. - */ - public static MemorySegment convertSerVer3toSerVer1(final CompactSketch skV3) { - //Check input sketch - final boolean validIn = skV3.isCompact() && skV3.isOrdered() && !skV3.hasMemorySegment(); - if (!validIn) { - throw new SketchesArgumentException("Invalid input sketch."); - } - - //Build V1 SetSketch in MemorySegment - final int curCount = skV3.getRetainedEntries(true); - final int bytes = (3 + curCount) << 3; - final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]);//Util.newHeapSegment(bytes); - //Pre0 - wseg.set(JAVA_BYTE, 0, (byte) 3); //preLongs - wseg.set(JAVA_BYTE, 1, (byte) 1); //SerVer - wseg.set(JAVA_BYTE, 2, (byte) 3); //Compact (SetSketch) - wseg.set(JAVA_BYTE, 6, (byte) 2); //Flags ReadOnly, LittleEndian - //Pre1 - wseg.set(JAVA_INT_UNALIGNED, 8, curCount); - //Pre2 - wseg.set(JAVA_LONG_UNALIGNED, 16, skV3.getThetaLong()); - //Data - if (curCount > 0) { - MemorySegment.copy(skV3.getCache(), 0, wseg, JAVA_LONG_UNALIGNED, 24, curCount); - } - return wseg; - } - - /** - * Converts a SerVer3 ordered, heap CompactSketch to a SerVer2 ordered, SetSketch in MemorySegment. - * This is exclusively for testing purposes. - * - *

    V2 is short-lived and dates from roughly Mid May 2015 to about June 1st, 2015. - * (V3 was created about June 15th in preparation for OpenSource in July.) - * The Theta sketch had evolved but still based on ByteBuffer. There was an UpdateSketch, - * the Alpha sketch, and the early HLL sketch. It also had an early adaptor for Pig. - * - * - *

    Serialization Version 2:

    - *
    -   * Long || Start Byte Adr:
    -   * Adr:
    -   *      ||  7 |   6   |     5    |   4   |   3   |    2   |    1   |     0         |
    -   *  0   || Seed Hash  |  Flags   | lgArr | lgNom | SkType | SerVer | MD_LONGS + RR |
    -   *
    -   *      || 15 |  14   |    13    |  12   |  11   |   10   |    9   |     8         |
    -   *  1   || --------------p-------------- | ---------Retained Entries Count-------- |
    -   *
    -   *      || 23 |  22   |    21    |  20   |  19   |   18   |   17   |    16         |
    -   *  2   || --------------------------THETA_LONG----------------------------------- |
    -   *
    -   *      ||                                                         |    24         |
    -   *  3   || ----------Start of Long Array, could be at 2 or 3 --------------------  |
    -   *  
    - * - *
      - *
    • The serialization for V2 was always to a compact form (no hash table spaces).
    • - *
    • MD_LONGS low 6 bits: 1 (Empty), 2 (Exact), 3 (Estimating).
    • - *
    • SerVer is always 2.
    • - *
    • The SkType had 4 values: 1,2,3,4; see below.
    • - *
    • Bytes lgNom and lgArr were only used by the QS and Alpha sketches.
    • - *
    • V2 LgResize is the top 2 bits of byte 0. Only relevant to the Alpha and QS sketches.
    • - *
    • The flags byte is in byte 5.
    • - *
    • The flag bits are specified below.
    • - *
    • There is a seedHash in bytes 6-7.
    • - *
    • p-sampling is bytes 12-15 of Pre1.
    • - *
    • The determination of empty is based on the sketch field empty_.
    • - *
    - *
    -   *   // Metadata byte Addresses
    -   *   private static final int METADATA_LONGS_BYTE        = 0; //low 6 bits
    -   *   private static final int LG_RESIZE_RATIO_BYTE       = 0; //upper 2 bits
    -   *   private static final int SER_VER_BYTE               = 1;
    -   *   private static final int SKETCH_TYPE_BYTE           = 2;
    -   *   private static final int LG_NOM_LONGS_BYTE          = 3;
    -   *   private static final int LG_ARR_LONGS_BYTE          = 4;
    -   *   private static final int FLAGS_BYTE                 = 5;
    -   *   private static final int SEED_HASH_SHORT            = 6;  //byte 6,7
    -   *   private static final int RETAINED_ENTRIES_COUNT_INT = 8;  //4 byte aligned
    -   *   private static final int P_FLOAT                    = 12; //4 byte aligned
    -   *   private static final int THETA_LONG                 = 16; //8-byte aligned
    -   *   //Backward compatibility
    -   *   private static final int FLAGS_BYTE_V1              = 6;
    -   *   private static final int LG_RESIZE_RATIO_BYTE_V1    = 5;
    -   *
    -   *   // Constant Values
    -   *   static final int SER_VER                        = 2;
    -   *   static final int ALPHA_SKETCH                   = 1; //SKETCH_TYPE_BYTE
    -   *   static final int QUICK_SELECT_SKETCH            = 2;
    -   *   static final int SET_SKETCH                     = 3;
    -   *   static final int BUFFERED_QUICK_SELECT_SKETCH   = 4;
    -   *   static final String[] SKETCH_TYPE_STR     =
    -   *       { "None", "AlphaSketch", "QuickSelectSketch", "SetSketch", "BufferedQuickSelectSketch" };
    -   *
    -   *   // flag bit masks
    -   *   static final int BIG_ENDIAN_FLAG_MASK     = 1;
    -   *   static final int READ_ONLY_FLAG_MASK      = 2;
    -   *   static final int EMPTY_FLAG_MASK          = 4;
    -   *   static final int NO_REBUILD_FLAG_MASK     = 8;
    -   *   static final int UNORDERED_FLAG_MASK     = 16;
    -   * 
    - * - * @param skV3 a SerVer3, ordered CompactSketch - * @param seed used for checking the seed hash (if one exists). - * @return a SerVer2 SetSketch as MemorySegment object. - */ - public static MemorySegment convertSerVer3toSerVer2(final CompactSketch skV3, final long seed) { - final short seedHash = Util.computeSeedHash(seed); - MemorySegment wseg = null; - - if (skV3 instanceof EmptyCompactSketch) { - wseg = MemorySegment.ofArray(new long[1]); - wseg.set(JAVA_BYTE, 0, (byte) 1); //preLongs - wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer - wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch - final byte flags = (byte) 0xE; //NoRebuild, Empty, ReadOnly, LE - wseg.set(JAVA_BYTE, 5, flags); - wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash); - return wseg; - } - if (skV3 instanceof SingleItemSketch) { - final SingleItemSketch sis = (SingleItemSketch) skV3; - wseg = MemorySegment.ofArray(new long[3]); - wseg.set(JAVA_BYTE, 0, (byte) 2); //preLongs - wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer - wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch - final byte flags = (byte) 0xA; //NoRebuild, notEmpty, ReadOnly, LE - wseg.set(JAVA_BYTE, 5, flags); - wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash); - wseg.set(JAVA_INT_UNALIGNED, 8, 1); - final long[] arr = sis.getCache(); - wseg.set(JAVA_LONG_UNALIGNED, 16, arr[0]); - return wseg; - } - //General CompactSketch - final int preLongs = skV3.getCompactPreambleLongs(); - final int entries = skV3.getRetainedEntries(true); - final boolean unordered = !(skV3.isOrdered()); - final byte flags = (byte) (0xA | (unordered ? 
16 : 0)); //Unordered, NoRebuild, notEmpty, ReadOnly, LE - wseg = MemorySegment.ofArray(new byte[(preLongs + entries) << 3]); - wseg.set(JAVA_BYTE, 0, (byte) preLongs); //preLongs - wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer - wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch - - wseg.set(JAVA_BYTE, 5, flags); - wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash); - wseg.set(JAVA_INT_UNALIGNED, 8, entries); - if (preLongs == 3) { - wseg.set(JAVA_LONG_UNALIGNED, 16, skV3.getThetaLong()); - } - final long[] arr = skV3.getCache(); - MemorySegment.copy(arr, 0, wseg, JAVA_LONG_UNALIGNED, preLongs << 3, entries); - return wseg; - } -} diff --git a/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java b/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java index c0bcbb0e4..e83651aed 100644 --- a/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java +++ b/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java @@ -21,8 +21,6 @@ import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.nio.charset.StandardCharsets.UTF_8; -import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer1; -import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer2; import static org.apache.datasketches.theta.HeapUnionTest.testAllCompactForms; import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; import static org.apache.datasketches.theta.SetOperation.getMaxUnionBytes; @@ -32,7 +30,6 @@ import java.lang.foreign.MemorySegment; import java.nio.ByteBuffer; -import java.util.Arrays; import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; @@ -451,105 +448,6 @@ public void checkDirectSegmentIn() { assertEquals(cOut.getEstimate(), totU, .05*k); } - @Test - public void checkSerVer1Handling() { - final int lgK = 12; //4096 - final int k = 1 << lgK; - final int u1 = 2*k; - final int u2 = 1024; //smaller exact sketch forces early stop - 
final int totU = u1+u2; - - final UpdateSketch usk1 = UpdateSketch.builder().setNominalEntries(k).build(); - final UpdateSketch usk2 = UpdateSketch.builder().setNominalEntries(k).build(); - - for (int i=0; i SerVer 3 has defaultSeedHash, because seed was not given above - assertEquals(cskResult.getSeedHash(), defaultSeedHash); } @Test //Compact Assumed Different Seed @@ -82,19 +68,6 @@ public void checkHeapifyCompactSketchAssumedDifferentSeed() { cskResult = CompactSketch.heapify(cskSeg); //don't check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here - - //SerialVersion2 test - final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = CompactSketch.heapify(sv2cskSeg); //don't check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here - - //SerialVersion1 test - final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = CompactSketch.heapify(sv1cskSeg); //don't check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - //SerVer 1 -> SerVer 3 has defaultSeedHash, because seed was not given above - assertEquals(cskResult.getSeedHash(), defaultSeedHash); } @Test //Compact Given Default Seed @@ -113,19 +86,6 @@ public void checkHeapifyCompactSketchGivenDefaultSeed() { cskResult = CompactSketch.heapify(cskSeg, seed); //check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here - - //SerialVersion2 test - final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = CompactSketch.heapify(sv2cskSeg, seed); //check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - 
assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here - - //SerialVersion1 test - final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = CompactSketch.heapify(sv1cskSeg, seed); //check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - //SerVer 1 -> SerVer 3, was given seed above, so we can test for it. - assertEquals(cskResult.getSeedHash(), seedHash); //SerVer 1 -> SerVer3 has defaultSeedHash } @Test //Compact Given Different Seed @@ -144,19 +104,6 @@ public void checkHeapifyCompactSketchGivenDifferentSeed() { cskResult = CompactSketch.heapify(cskSeg, seed); //check seedHash here assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion2 test - final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = CompactSketch.heapify(sv2cskSeg, seed); //check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion1 test - final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = CompactSketch.heapify(sv1cskSeg, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - //SerVer 1 -> SerVer 3, was given seed above, so we can test for it. 
- assertEquals(cskResult.getSeedHash(), seedHash); } //Heapify Sketch @@ -177,18 +124,6 @@ public void checkHeapifySketchAssumedDefaultSeed() { cskResult = (CompactSketch) Sketch.heapify(cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion2 test - final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = (CompactSketch) Sketch.heapify(sv2cskSeg); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion1 test - final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = (CompactSketch) Sketch.heapify(sv1cskSeg); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); } @Test @@ -207,18 +142,6 @@ public void checkHeapifySketchAssumedDifferentSeed() { cskResult = (CompactSketch) Sketch.heapify(cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion2 test - final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = (CompactSketch) Sketch.heapify(sv2cskSeg); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion1 test - final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = (CompactSketch) Sketch.heapify(sv1cskSeg); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), defaultSeedHash); } @Test @@ -237,18 +160,6 @@ public void checkHeapifySketchGivenDefaultSeed() { cskResult = (CompactSketch) Sketch.heapify(cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion2 test 
- final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = (CompactSketch) Sketch.heapify(sv2cskSeg, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion1 test - final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = (CompactSketch) Sketch.heapify(sv1cskSeg, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); } @Test @@ -267,18 +178,6 @@ public void checkHeapifySketchGivenDifferentSeed() { cskResult = (CompactSketch) Sketch.heapify(cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion2 test - final MemorySegment sv2cskSeg = BackwardConversions.convertSerVer3toSerVer2(csk, seed).asReadOnly(); - cskResult = (CompactSketch) Sketch.heapify(sv2cskSeg, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - - //SerialVersion1 test - final MemorySegment sv1cskSeg = BackwardConversions.convertSerVer3toSerVer1(csk).asReadOnly(); - cskResult = (CompactSketch) Sketch.heapify(sv1cskSeg, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); } //Wrap CompactSketch @@ -302,24 +201,6 @@ public void checkWrapCompactSketchAssumedDefaultSeed() { assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); } - - //SerialVersion2 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = CompactSketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } - - //SerialVersion1 test - try(Arena 
arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = CompactSketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } } @Test @@ -341,24 +222,6 @@ public void checkWrapCompactSketchAssumedDifferentSeed() { assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); } - - //SerialVersion2 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = CompactSketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } - - //SerialVersion1 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = CompactSketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), defaultSeedHash); - assertFalse(cskResult.isOffHeap()); - } } @Test @@ -380,24 +243,6 @@ public void checkWrapCompactSketchGivenDefaultSeed() { assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); } - - //SerialVersion2 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = CompactSketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } - - //SerialVersion1 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = CompactSketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - 
assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } } @Test @@ -419,24 +264,6 @@ public void checkWrapCompactSketchGivenDifferentSeed() { assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); } - - //SerialVersion2 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = CompactSketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } - - //SerialVersion1 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = CompactSketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } } //Wrap Sketch @@ -460,24 +287,6 @@ public void checkWrapSketchAssumedDefaultSeed() { assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); } - - //SerialVersion2 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } - - //SerialVersion1 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } } @Test @@ -499,24 +308,6 @@ public void checkWrapSketchAssumedDifferentSeed() { assertEquals(cskResult.getSeedHash(), seedHash); 
assertTrue(cskResult.isOffHeap()); } - - //SerialVersion2 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } - - //SerialVersion1 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), defaultSeedHash); - assertFalse(cskResult.isOffHeap()); - } } @Test @@ -538,24 +329,6 @@ public void checkWrapSketchGivenDefaultSeed() { assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); } - - //SerialVersion2 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } - - //SerialVersion1 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } } @Test @@ -577,24 +350,6 @@ public void checkWrapSketchGivenDifferentSeed() { assertEquals(cskResult.getSeedHash(), seedHash); assertTrue(cskResult.isOffHeap()); } - - //SerialVersion2 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer2(csk, seed), arena); - cskResult = (CompactSketch) 
Sketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } - - //SerialVersion1 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(BackwardConversions.convertSerVer3toSerVer1(csk), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertFalse(cskResult.isOffHeap()); - } } private static MemorySegment putOffHeap(final MemorySegment heapSeg, final Arena arena) { diff --git a/src/test/java/org/apache/datasketches/theta/SketchMiscTest.java b/src/test/java/org/apache/datasketches/theta/SketchMiscTest.java index 5035433a5..e46fd20e6 100644 --- a/src/test/java/org/apache/datasketches/theta/SketchMiscTest.java +++ b/src/test/java/org/apache/datasketches/theta/SketchMiscTest.java @@ -20,8 +20,6 @@ package org.apache.datasketches.theta; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; -import static org.testng.Assert.assertTrue; import java.lang.foreign.MemorySegment; @@ -141,32 +139,6 @@ public void checkUtilMethods() { assertEquals(24+2*k*8, maxSkBytes); } - @Test - public void checkStaticEstimators() { - final int k = 4096; - final int u = 4*k; - final CompactSketch csk = getCompactSketch(k, 0, u); - final MemorySegment srcSeg = getMemorySegmentFromCompactSketch(csk); - final double est = Sketch.getEstimate(srcSeg); - assertEquals(est, u, 0.05*u); - final double rse = 1.0/Math.sqrt(k); - final double ub = Sketch.getUpperBound(1, srcSeg); - assertEquals(ub, est+rse, 0.05*u); - final double lb = Sketch.getLowerBound(1, srcSeg); - assertEquals(lb, est-rse, 0.05*u); - final MemorySegment segV1 = BackwardConversions.convertSerVer3toSerVer1(csk); - boolean empty = Sketch.getEmpty(segV1); - assertFalse(empty); - - final CompactSketch csk2 = getCompactSketch(k, 0, 0); - 
final MemorySegment emptySegV3 = getMemorySegmentFromCompactSketch(csk2); - assertEquals(Sketch.getRetainedEntries(emptySegV3), 0); - assertEquals(Sketch.getThetaLong(emptySegV3), Long.MAX_VALUE); - final MemorySegment emptySegV1 = BackwardConversions.convertSerVer3toSerVer1(csk2); - empty = Sketch.getEmpty(emptySegV1); - assertTrue(empty); - } - @Test(expectedExceptions = SketchesArgumentException.class) public void checkBadSketchFamily() { final Union union = SetOperation.builder().buildUnion(); diff --git a/src/test/java/org/apache/datasketches/theta/SketchTest.java b/src/test/java/org/apache/datasketches/theta/SketchTest.java index e11aaa079..cffee0b08 100644 --- a/src/test/java/org/apache/datasketches/theta/SketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/SketchTest.java @@ -26,32 +26,24 @@ import static org.apache.datasketches.common.ResizeFactor.X2; import static org.apache.datasketches.common.ResizeFactor.X4; import static org.apache.datasketches.common.ResizeFactor.X8; -import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer1; -import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer2; +import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.theta.CompactOperations.computeCompactPreLongs; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; import static org.apache.datasketches.theta.Sketch.getMaxCompactSketchBytes; -import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import static org.testng.Assert.fail; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; import 
org.apache.datasketches.common.MemorySegmentStatus; import org.apache.datasketches.common.ResizeFactor; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.DirectCompactSketch; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Union; -import org.apache.datasketches.theta.UpdateSketch; import org.apache.datasketches.thetacommon.ThetaUtil; import org.testng.annotations.Test; @@ -266,7 +258,6 @@ public void checkWrapAlphaCompactExcep() { //corrupt: Util.setBits(seg, FLAGS_BYTE, (byte) COMPACT_FLAG_MASK); Sketch.wrap(seg); - } @Test(expectedExceptions = SketchesArgumentException.class) @@ -301,31 +292,6 @@ public void checkValidSketchID() { assertTrue(Sketch.isValidSketchID(COMPACT.getID())); } - @Test - public void checkWrapToHeapifyConversion1() { - final int k = 512; - final UpdateSketch sketch1 = UpdateSketch.builder().setNominalEntries(k).build(); - for (int i = 0; i < k; i++) { - sketch1.update(i); - } - final double uest1 = sketch1.getEstimate(); - - final CompactSketch csk = sketch1.compact(); - assertEquals(csk.getEstimate(), uest1); - - final MemorySegment v1seg = convertSerVer3toSerVer1(csk); - Sketch csk2 = Sketch.wrap(v1seg); //fails - assertFalse(csk2.isOffHeap()); - assertFalse(csk2.hasMemorySegment()); - assertEquals(uest1, csk2.getEstimate(), 0.0); - - final MemorySegment v2seg = convertSerVer3toSerVer2(csk, Util.DEFAULT_UPDATE_SEED); - csk2 = Sketch.wrap(v2seg); - assertFalse(csk2.isOffHeap()); - assertFalse(csk2.hasMemorySegment()); - assertEquals(uest1, csk2.getEstimate(), 0.0); - } - @Test public void checkIsSameResource() { final int k = 16; diff --git a/src/test/java/org/apache/datasketches/theta/UnionImplTest.java 
b/src/test/java/org/apache/datasketches/theta/UnionImplTest.java index 627fe097d..1cc7c76e7 100644 --- a/src/test/java/org/apache/datasketches/theta/UnionImplTest.java +++ b/src/test/java/org/apache/datasketches/theta/UnionImplTest.java @@ -20,8 +20,6 @@ package org.apache.datasketches.theta; import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer1; -import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer2; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; @@ -148,47 +146,6 @@ public void checkCorruptFamilyException() { union.union(seg); } - @Test(expectedExceptions = SketchesArgumentException.class) - public void checkVer2FamilyException() { - final int k = 16; - final UpdateSketch sketch = UpdateSketch.builder().setNominalEntries(k).build(); - for (int i=0; i Date: Wed, 1 Oct 2025 13:13:56 -0700 Subject: [PATCH 21/26] More cleanup after removing SerVer 1 and 2 --- .../datasketches/theta/CompactSketch.java | 161 +++++++--------- .../datasketches/theta/PreambleUtil.java | 45 ++--- .../org/apache/datasketches/theta/Sketch.java | 60 ++---- ...2Test.java => HeapifyWrapSerVer3Test.java} | 180 +----------------- .../datasketches/theta/PreambleUtilTest.java | 10 - 5 files changed, 105 insertions(+), 351 deletions(-) rename src/test/java/org/apache/datasketches/theta/{HeapifyWrapSerVer1and2Test.java => HeapifyWrapSerVer3Test.java} (51%) diff --git a/src/main/java/org/apache/datasketches/theta/CompactSketch.java b/src/main/java/org/apache/datasketches/theta/CompactSketch.java index ce5861d41..fb7577deb 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/CompactSketch.java @@ -68,17 +68,13 @@ public abstract class CompactSketch extends Sketch { *

    The resulting sketch will not retain any link to the source MemorySegment and all of its data will be * copied to the heap CompactSketch.

    * - *

    This method assumes that the sketch image was created with the correct hash seed, so it is not checked. - * The resulting on-heap CompactSketch will be given the seedHash derived from the given sketch image. - * However, Serial Version 1 sketch images do not have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.

    + *

    The {@link Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} is assumed.

    * * @param srcSeg an image of a CompactSketch. * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg) { - //final boolean checkSeedHash = extractSerVer(srcSeg) != 1; - return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED, false); //false for SerVer 1 only + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -87,9 +83,7 @@ public static CompactSketch heapify(final MemorySegment srcSeg) { *

    The resulting sketch will not retain any link to the source MemorySegment and all of its data will be * copied to the heap CompactSketch.

    * - *

    This method checks if the given expectedSeed was used to create the source MemorySegment image. - * However, SerialVersion 1 sketch images cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.

    + *

    This method checks if the given expectedSeed was used to create the source MemorySegment image.

    * * @param srcSeg an image of a CompactSketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -97,10 +91,6 @@ public static CompactSketch heapify(final MemorySegment srcSeg) { * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { - return heapify(srcSeg, expectedSeed, true); - } - - private static CompactSketch heapify(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { final int serVer = extractSerVer(srcSeg); final int familyID = extractFamilyID(srcSeg); final Family family = idToFamily(familyID); @@ -108,17 +98,18 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } if (serVer == 4) { - return heapifyV4(srcSeg, seed, enforceSeed); + return heapifyV4(srcSeg, expectedSeed); } if (serVer == 3) { final int flags = extractFlags(srcSeg); final boolean srcOrdered = (flags & ORDERED_FLAG_MASK) != 0; final boolean empty = (flags & EMPTY_FLAG_MASK) != 0; - if (enforceSeed && !empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } + if (!empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, expectedSeed); } return CompactOperations.segmentToCompact(srcSeg, srcOrdered, null); } //not SerVer 3 or 4 - throw new SketchesArgumentException("Unknown Serialization Version: " + serVer); + throw new SketchesArgumentException( + "Corrupted: Serialization Version " + serVer + " not recognized."); } /** @@ -126,24 +117,17 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

    Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".

    - * *

    Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

    * - *

    This method assumes that the sketch image was created with the correct hash seed, so it is not checked. - * However, Serial Version 1 sketch images do not have a seedHash field, - * so the resulting on-heap CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.

    + *

    The {@link Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} is assumed.

    * * @param srcSeg an image of a Sketch. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given MemorySegment. */ public static CompactSketch wrap(final MemorySegment srcSeg) { - return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED, false); + return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -151,69 +135,54 @@ public static CompactSketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

    Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".

    - * *

    Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

    * - *

    This method checks if the given expectedSeed was used to create the source MemorySegment image. - * However, SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.

    + *

    This method checks if the given expectedSeed was used to create the source MemorySegment image.

    * * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given MemorySegment. */ public static CompactSketch wrap(final MemorySegment srcSeg, final long expectedSeed) { - return wrap(srcSeg, expectedSeed, true); - } - - private static CompactSketch wrap(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { final int serVer = extractSerVer(srcSeg); final int familyID = extractFamilyID(srcSeg); final Family family = Family.idToFamily(familyID); if (family != Family.COMPACT) { throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } - final short seedHash = Util.computeSeedHash(seed); + final short seedHash = Util.computeSeedHash(expectedSeed); + - switch (serVer) { - case 3: { - if (PreambleUtil.isEmptyFlag(srcSeg)) { - return EmptyCompactSketch.getHeapInstance(srcSeg); - } - if (otherCheckForSingleItem(srcSeg)) { - return SingleItemSketch.heapify(srcSeg, enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); - } - //not empty & not singleItem - final int flags = extractFlags(srcSeg); - final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; - if (!compactFlag) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have compact flag set"); - } - final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; - if (!readOnly) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have Read-Only flag set"); - } - return DirectCompactSketch.wrapInstance(srcSeg, - enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); + if (serVer == 3) { + if (PreambleUtil.isEmptyFlag(srcSeg)) { + return EmptyCompactSketch.getHeapInstance(srcSeg); + } + if (otherCheckForSingleItem(srcSeg)) { + return SingleItemSketch.heapify(srcSeg, seedHash); } - case 4: { - return DirectCompactCompressedSketch.wrapInstance(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + //not empty & not singleItem + final int flags = extractFlags(srcSeg); + final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; + if (!compactFlag) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have compact flag set"); } - default: { + final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; + if (!readOnly) { throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); + "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } + return DirectCompactSketch.wrapInstance(srcSeg, seedHash); } + if (serVer == 4) { + return DirectCompactCompressedSketch.wrapInstance(srcSeg, seedHash); + } + //not SerVer 3 or 4 + throw new SketchesArgumentException( + "Corrupted: Serialization Version " + serVer + " not recognized."); } /** @@ -278,38 +247,38 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo } final short seedHash = Util.computeSeedHash(seed); - switch (serVer) { - case 3: { - final int flags = bytes[FLAGS_BYTE]; - if ((flags & EMPTY_FLAG_MASK) > 0) { - return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); - } - final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; - if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { - return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); - } - //not empty & not singleItem - final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; - if (!compactFlag) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have compact flag set"); - } - final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; - if (!readOnly) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have Read-Only flag set"); - } - return WrappedCompactSketch.wrapInstance(bytes, - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + + if (serVer == 3) { + final int flags = bytes[FLAGS_BYTE]; + if ((flags & EMPTY_FLAG_MASK) > 0) { + return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); + } + final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; + if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { + return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); } - case 4: { - return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); + //not empty & not singleItem + final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; + if (!compactFlag) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have compact flag set"); } - default: { + final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; + if (!readOnly) { throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); + "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } + return WrappedCompactSketch.wrapInstance(bytes, + enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + } + if (serVer ==4) { + return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); } + //not SerVer 3 or 4 + throw new SketchesArgumentException( + "Corrupted: Serialization Version " + serVer + " not recognized."); + + } //Sketch Overrides @@ -436,12 +405,12 @@ private byte[] toByteArrayV4() { return bytes; } - private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { + private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed) { final int preLongs = Sketch.getPreambleLongs(srcSeg); final int entryBits = extractEntryBitsV4(srcSeg); final int numEntriesBytes = extractNumEntriesBytesV4(srcSeg); final short seedHash = (short) extractSeedHash(srcSeg); - if (enforceSeed) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } + PreambleUtil.checkSegmentSeedHash(srcSeg, seed); int offsetBytes = 8; long theta = Long.MAX_VALUE; if (preLongs > 1) { diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index 19dec2061..a95ebaaf6 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -190,10 +190,10 @@ private PreambleUtil() {} // ###### DO NOT MESS WITH THIS FROM HERE ... // Preamble byte Addresses - static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte. - static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte. Not used by compact, direct + static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte 0. + static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte 0. 
Used by Update, Alpha, not used by compact, direct static final int SER_VER_BYTE = 1; - static final int FAMILY_BYTE = 2; //SerVer1,2 was SKETCH_TYPE_BYTE + static final int FAMILY_BYTE = 2; static final int LG_NOM_LONGS_BYTE = 3; //not used by compact static final int LG_ARR_LONGS_BYTE = 4; //not used by compact static final int FLAGS_BYTE = 5; @@ -203,28 +203,23 @@ private PreambleUtil() {} static final int THETA_LONG = 16; //8-byte aligned static final int UNION_THETA_LONG = 24; //8-byte aligned, only used by Union - // flag bit masks - static final int RESERVED_FLAG_MASK = 1; //SerVer 1, 2, 3. Now Reserved, no longer used. - static final int READ_ONLY_FLAG_MASK = 2; //Set but not read. Reserved. SerVer 1, 2, 3 - static final int EMPTY_FLAG_MASK = 4; //SerVer 2, 3 - static final int COMPACT_FLAG_MASK = 8; //SerVer 2 was NO_REBUILD_FLAG_MASK, 3 - static final int ORDERED_FLAG_MASK = 16;//SerVer 2 was UNORDERED_FLAG_MASK, 3 - static final int SINGLEITEM_FLAG_MASK = 32;//SerVer 3 - //The last 2 bits of the flags byte are reserved and assumed to be zero, for now. - - //Backward compatibility: SerVer1 preamble always 3 longs, SerVer2 preamble: 1, 2, 3 longs - // SKETCH_TYPE_BYTE 2 //SerVer1, SerVer2 - // V1, V2 types: Alpha = 1, QuickSelect = 2, SetSketch = 3; V3 only: Buffered QS = 4 - static final int LG_RESIZE_RATIO_BYTE_V1 = 5; //used by SerVer 1 - static final int FLAGS_BYTE_V1 = 6; //used by SerVer 1 + // flag byte bit masks + static final int RESERVED_FLAG_MASK = 1; //Bit 0: Reserved, no longer used. + static final int READ_ONLY_FLAG_MASK = 2; //Bit 1: Reserved, Set but not read. + static final int EMPTY_FLAG_MASK = 4; //Bit 2: + static final int COMPACT_FLAG_MASK = 8; //Bit 3: + static final int ORDERED_FLAG_MASK = 16;//Bit 4: + static final int SINGLEITEM_FLAG_MASK = 32;//Bit 5: + //The last 2 bits (Bit 6,7) of the flags byte are reserved and assumed to be zero. 
//Other constants static final int SER_VER = 3; + static final int SER_VER_COMPRESSED = 4; // serial version 4 compressed ordered sketch, not empty, not single item - static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes - static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries - static final int THETA_LONG_V4 = 8; //8-byte aligned + static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes + static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries + static final int THETA_LONG_V4 = 8; //8-byte aligned /** * Computes the number of bytes required for an updatable sketch using a hash-table cache. @@ -377,17 +372,13 @@ else if (preLongs == 3) { //@formatter:on static int extractPreLongs(final MemorySegment seg) { - return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //for SerVer 1,2,3 + return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; } static int extractLgResizeFactor(final MemorySegment seg) { return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT & 0X3; } - static int extractLgResizeRatioV1(final MemorySegment seg) { - return seg.get(JAVA_BYTE, LG_RESIZE_RATIO_BYTE_V1) & 0X3; - } - static int extractSerVer(final MemorySegment seg) { return seg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; } @@ -408,10 +399,6 @@ static int extractFlags(final MemorySegment seg) { return seg.get(JAVA_BYTE, FLAGS_BYTE) & 0XFF; } - static int extractFlagsV1(final MemorySegment seg) { - return seg.get(JAVA_BYTE, FLAGS_BYTE_V1) & 0XFF; - } - static int extractSeedHash(final MemorySegment seg) { return seg.get(JAVA_SHORT_UNALIGNED, SEED_HASH_SHORT) & 0XFFFF; } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index bc944478d..1e9c65aa0 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ 
b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -78,12 +78,7 @@ public abstract class Sketch implements MemorySegmentStatus { * @return a Sketch on the heap. */ public static Sketch heapify(final MemorySegment srcSeg) { -// return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); - final int familyID = extractFamilyID(srcSeg); - if (familyID == Family.COMPACT.getID()) { - return CompactSketch.heapify(srcSeg);//, Util.DEFAULT_UPDATE_SEED); - } - return heapifyUpdateSketchFromMemorySegment(srcSeg, Util.DEFAULT_UPDATE_SEED); + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -97,8 +92,6 @@ public static Sketch heapify(final MemorySegment srcSeg) { *

    For Compact Sketches this method assumes that the sketch image was created with the * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.

    * - *

    Note: This assumes only SerVer 3 and later.

    - * * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. @@ -119,7 +112,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

    Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have + *

    Only "Direct" sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a on-heap CompactSketch * where all data will be copied to the heap. These early versions were never designed to "wrap".

    @@ -128,34 +121,15 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

    * - *

    For Update Sketches this method checks if the + *

    This method checks if the * Default Update Seed

    - * was used to create the source MemorySegment image. - * - *

    For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked. SerialVersion 1 (pre-open-source) sketches cannot be checked.

    + * was used to create the source MemorySegment image.

    * * @param srcSeg a MemorySegment with an image of a Sketch. * @return a read-only Sketch backed by the given MemorySegment */ public static Sketch wrap(final MemorySegment srcSeg) { - final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family == Family.QUICKSELECT) { - if (serVer == 3 && preLongs == 3) { - return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, Util.DEFAULT_UPDATE_SEED); - } else { - throw new SketchesArgumentException( - "Corrupted: " + family + " family image: must have SerVer = 3 and preLongs = 3"); - } - } - if (family == Family.COMPACT) { - return CompactSketch.wrap(srcSeg); - } - throw new SketchesArgumentException( - "Cannot wrap family: " + family + " as a Sketch"); + return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -163,7 +137,7 @@ public static Sketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

    Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have + *

    Only "Direct" sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a on-heap CompactSketch * where all data will be copied to the heap. These early versions were never designed to "wrap".

    @@ -172,12 +146,8 @@ public static Sketch wrap(final MemorySegment srcSeg) { * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

    * - *

    For Update Sketches this method checks if the - * Default Update Seed

    - * was used to create the source MemorySegment image. - * - *

    For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked. SerialVersion 1 (pre-open-source) sketches cannot be checked.

    + *

    This method checks if the given expectedSeed + * was used to create the source MemorySegment image.

    * * @param srcSeg a MemorySegment with an image of a Sketch. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -388,14 +358,14 @@ public static int getRetainedEntries(final MemorySegment srcSeg) { } return entries; } - //SerVer 2 or 3 + final int preLongs = Sketch.getPreambleLongs(srcSeg); - final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 + final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; if (preLongs == 1) { return empty ? 0 : 1; } //preLongs > 1 - return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); //for SerVer 1,2,3 + return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); } /** @@ -658,16 +628,16 @@ static boolean getEmpty(final MemorySegment srcSeg) { if (serVer == 1) { return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; } - return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2,3,4 + return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; } static int getPreambleLongs(final MemorySegment srcSeg) { - return getAndCheckPreLongs(srcSeg); //for SerVer 1,2,3,4 + return getAndCheckPreLongs(srcSeg); } static long getThetaLong(final MemorySegment srcSeg) { final int preLongs = Sketch.getPreambleLongs(srcSeg); - return preLongs < 3 ? Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3,4 + return preLongs < 3 ? Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); } /** @@ -725,7 +695,7 @@ static final double upperBound(final int curCount, final long thetaLong, final i } /** - * Instantiates a Heap Update Sketch from MemorySegment. Only SerVer3. SerVer 1 & 2 already handled. + * Instantiates a Heap Update Sketch from MemorySegment. * @param srcSeg the source MemorySegment * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. 
diff --git a/src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer1and2Test.java b/src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer3Test.java similarity index 51% rename from src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer1and2Test.java rename to src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer3Test.java index 68e1d04c6..b1dba552c 100644 --- a/src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer1and2Test.java +++ b/src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer3Test.java @@ -29,7 +29,7 @@ import org.testng.annotations.Test; @SuppressWarnings("resource") -public class HeapifyWrapSerVer1and2Test { +public class HeapifyWrapSerVer3Test { private static final short defaultSeedHash = Util.computeSeedHash(Util.DEFAULT_UPDATE_SEED); //Heapify CompactSketch @@ -46,50 +46,13 @@ public void checkHeapifyCompactSketchAssumedDefaultSeed() { final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); CompactSketch cskResult; - //SerialVersion3 test - cskResult = CompactSketch.heapify(cskSeg); //don't check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here - } - - @Test //Compact Assumed Different Seed - public void checkHeapifyCompactSketchAssumedDifferentSeed() { - final int k = 64; - final long seed = 128L; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - - final CompactSketch csk = usk.compact(); - final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); - CompactSketch cskResult; - - //SerialVersion3 test - cskResult = CompactSketch.heapify(cskSeg); //don't check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here - } - 
- @Test //Compact Given Default Seed - public void checkHeapifyCompactSketchGivenDefaultSeed() { - final int k = 64; - final long seed = Util.DEFAULT_UPDATE_SEED; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - - final CompactSketch csk = usk.compact(); - final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); - CompactSketch cskResult; - - //SerialVersion3 test - cskResult = CompactSketch.heapify(cskSeg, seed); //check seedHash here + cskResult = CompactSketch.heapify(cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here + assertEquals(cskResult.getSeedHash(), seedHash); } - @Test //Compact Given Different Seed - public void checkHeapifyCompactSketchGivenDifferentSeed() { + @Test + public void checkHeapifyCompactSketchDifferentSeed() { final int k = 64; final long seed = 128L; final short seedHash = Util.computeSeedHash(seed); @@ -100,8 +63,7 @@ public void checkHeapifyCompactSketchGivenDifferentSeed() { final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); CompactSketch cskResult; - //SerialVersion3 test - cskResult = CompactSketch.heapify(cskSeg, seed); //check seedHash here + cskResult = CompactSketch.heapify(cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); } @@ -120,14 +82,13 @@ public void checkHeapifySketchAssumedDefaultSeed() { final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); CompactSketch cskResult; - //SerialVersion3 test cskResult = (CompactSketch) Sketch.heapify(cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); } @Test - public void checkHeapifySketchAssumedDifferentSeed() { + public void 
checkHeapifySketchDifferentSeed() { final int k = 64; final long seed = 128L; final short seedHash = Util.computeSeedHash(seed); @@ -138,43 +99,6 @@ public void checkHeapifySketchAssumedDifferentSeed() { final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); CompactSketch cskResult; - //SerialVersion3 test - cskResult = (CompactSketch) Sketch.heapify(cskSeg); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - } - - @Test - public void checkHeapifySketchGivenDefaultSeed() { - final int k = 64; - final long seed = Util.DEFAULT_UPDATE_SEED; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - - final CompactSketch csk = usk.compact(); - final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); - CompactSketch cskResult; - - //SerialVersion3 test - cskResult = (CompactSketch) Sketch.heapify(cskSeg, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - } - - @Test - public void checkHeapifySketchGivenDifferentSeed() { - final int k = 64; - final long seed = 128L; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - - final CompactSketch csk = usk.compact(); - final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); - CompactSketch cskResult; - - //SerialVersion3 test cskResult = (CompactSketch) Sketch.heapify(cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); @@ -193,28 +117,6 @@ public void checkWrapCompactSketchAssumedDefaultSeed() { MemorySegment offHeap; final CompactSketch csk = usk.compact(); - 
//SerialVersion3 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = CompactSketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertTrue(cskResult.isOffHeap()); - } - } - - @Test - public void checkWrapCompactSketchAssumedDifferentSeed() { - final int k = 64; - final long seed = 128L; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - CompactSketch cskResult; - MemorySegment offHeap; - final CompactSketch csk = usk.compact(); - - //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); cskResult = CompactSketch.wrap(offHeap); @@ -225,28 +127,7 @@ public void checkWrapCompactSketchAssumedDifferentSeed() { } @Test - public void checkWrapCompactSketchGivenDefaultSeed() { - final int k = 64; - final long seed = Util.DEFAULT_UPDATE_SEED; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - CompactSketch cskResult; - MemorySegment offHeap; - final CompactSketch csk = usk.compact(); - - //SerialVersion3 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = CompactSketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertTrue(cskResult.isOffHeap()); - } - } - - @Test - public void checkWrapCompactSketchGivenDifferentSeed() { + public void checkWrapCompactSketchDifferentSeed() { final int k = 64; final long seed = 128L; final short seedHash = Util.computeSeedHash(seed); @@ -256,7 
+137,6 @@ public void checkWrapCompactSketchGivenDifferentSeed() { MemorySegment offHeap; final CompactSketch csk = usk.compact(); - //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); cskResult = CompactSketch.wrap(offHeap, seed); @@ -290,49 +170,7 @@ public void checkWrapSketchAssumedDefaultSeed() { } @Test - public void checkWrapSketchAssumedDifferentSeed() { - final int k = 64; - final long seed = 128L; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - CompactSketch cskResult; - MemorySegment offHeap; - final CompactSketch csk = usk.compact(); - - //SerialVersion3 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertTrue(cskResult.isOffHeap()); - } - } - - @Test - public void checkWrapSketchGivenDefaultSeed() { - final int k = 64; - final long seed = Util.DEFAULT_UPDATE_SEED; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - CompactSketch cskResult; - MemorySegment offHeap; - final CompactSketch csk = usk.compact(); - - //SerialVersion3 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertTrue(cskResult.isOffHeap()); - } - } - - @Test - public void checkWrapSketchGivenDifferentSeed() { + public void 
checkWrapSketchDifferentSeed() { final int k = 64; final long seed = 128L; final short seedHash = Util.computeSeedHash(seed); diff --git a/src/test/java/org/apache/datasketches/theta/PreambleUtilTest.java b/src/test/java/org/apache/datasketches/theta/PreambleUtilTest.java index f88b39185..61093c2a5 100644 --- a/src/test/java/org/apache/datasketches/theta/PreambleUtilTest.java +++ b/src/test/java/org/apache/datasketches/theta/PreambleUtilTest.java @@ -23,11 +23,9 @@ import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractFlagsV1; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor; -import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeRatioV1; import static org.apache.datasketches.theta.PreambleUtil.extractP; import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; @@ -59,12 +57,6 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Union; -import org.apache.datasketches.theta.UpdateSketch; import org.testng.Assert; import org.testng.annotations.Test; @@ -191,12 +183,10 @@ public void checkInsertsAndExtracts() { insertFlags(wseg, 3); assertEquals(extractFlags(wseg), 3); - assertEquals(extractLgResizeRatioV1(wseg), 3); 
//also at byte 5, limited to 2 bits insertFlags(wseg, 0); insertSeedHash(wseg, ++v); assertEquals(extractSeedHash(wseg), v); - assertEquals(extractFlagsV1(wseg), v); //also at byte 6 insertSeedHash(wseg, 0); insertCurCount(wseg, ++v); From 5c3b84581ca9edbed5091315c739b9151bc1a34d Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 1 Oct 2025 13:23:40 -0700 Subject: [PATCH 22/26] Fix Javadoc warning. --- src/main/java/org/apache/datasketches/theta/Sketch.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 1e9c65aa0..33593e430 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -122,7 +122,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * This is actually faster and consumes less overall space.

    * *

    This method checks if the - * Default Update Seed

    + * Default Update Seed * was used to create the source MemorySegment image.

    * * @param srcSeg a MemorySegment with an image of a Sketch. From 8c66f20720f1657b9188a6b969ccccb1e1af1af1 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 2 Oct 2025 14:15:34 -0700 Subject: [PATCH 23/26] Lots of cleanup after Sketches class was removed. Fixed some very bad assumptions with heapify(...) and wrap(...) where if the simple call was made, i.e., heapify(seg) not only was the default update seed assumed (this is normal), but the seed hash was not checked! Yikes! Same thing with wrap(...). I'm not sure how that ever happened, but it is now fixed. I think a lot of that was associated with the old SerVer 1 and 2 stuff. This also fixed the squirrelly testing that somehow figured that the above was ok!?! Also added some checks when reading the preamble of a user's input segment -- made sure that the segment was at least large enough to hold the full preamble. --- .../datasketches/theta/CompactOperations.java | 7 +- .../datasketches/theta/CompactSketch.java | 42 ++---- .../theta/DirectCompactCompressedSketch.java | 2 +- .../theta/DirectCompactSketch.java | 10 +- .../theta/DirectQuickSelectSketch.java | 2 +- .../theta/DirectQuickSelectSketchR.java | 9 +- .../theta/EmptyCompactSketch.java | 2 +- .../datasketches/theta/HeapAlphaSketch.java | 2 +- .../datasketches/theta/HeapCompactSketch.java | 2 +- .../theta/HeapQuickSelectSketch.java | 4 +- .../datasketches/theta/PreambleUtil.java | 29 ++-- .../datasketches/theta/SingleItemSketch.java | 36 ++--- .../org/apache/datasketches/theta/Sketch.java | 131 ++++++------------ .../datasketches/theta/UpdateSketch.java | 33 ++--- .../theta/WrappedCompactCompressedSketch.java | 2 +- .../theta/WrappedCompactSketch.java | 2 +- 16 files changed, 127 insertions(+), 188 deletions(-) diff --git a/src/main/java/org/apache/datasketches/theta/CompactOperations.java b/src/main/java/org/apache/datasketches/theta/CompactOperations.java index 9d23f7263..9fb917b24 100644 --- 
a/src/main/java/org/apache/datasketches/theta/CompactOperations.java +++ b/src/main/java/org/apache/datasketches/theta/CompactOperations.java @@ -29,6 +29,7 @@ import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.SER_VER; import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; @@ -121,7 +122,7 @@ static CompactSketch segmentToCompact( final MemorySegment dstWSeg) { //extract Pre0 fields and Flags from srcMem - final int srcPreLongs = Sketch.getPreambleLongs(srcSeg); + final int srcPreLongs = checkSegPreambleCap(srcSeg); final int srcSerVer = extractSerVer(srcSeg); //not used final int srcFamId = extractFamilyID(srcSeg); final int srcLgArrLongs = extractLgArrLongs(srcSeg); @@ -136,7 +137,7 @@ static CompactSketch segmentToCompact( final boolean srcSingleFlag = (srcFlags & SINGLEITEM_FLAG_MASK) > 0; final boolean single = srcSingleFlag - || SingleItemSketch.otherCheckForSingleItem(srcPreLongs, srcSerVer, srcFamId, srcFlags); + || SingleItemSketch.checkForSingleItem(srcPreLongs, srcSerVer, srcFamId, srcFlags); //extract pre1 and pre2 fields final int curCount = single ? 1 : (srcPreLongs > 1) ? extractCurCount(srcSeg) : 0; @@ -318,7 +319,7 @@ static long[] compactCache(final long[] srcCache, final int curCount, * This is checked in all compacting operations. * 7 <1.0 !0 F OK This corresponds to a sketch in estimation mode * - * #4 is handled by correctThetaOnCompat(boolean, int) (below). + * #4 is handled by correctThetaOnCompact(boolean, int) (below). 
* #2 & #6 handled by checkIllegalCurCountAndEmpty(boolean, int) */ diff --git a/src/main/java/org/apache/datasketches/theta/CompactSketch.java b/src/main/java/org/apache/datasketches/theta/CompactSketch.java index fb7577deb..aaa751af0 100644 --- a/src/main/java/org/apache/datasketches/theta/CompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/CompactSketch.java @@ -22,7 +22,6 @@ import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED; -import static org.apache.datasketches.common.ByteArrayUtil.getShortLE; import static org.apache.datasketches.common.Family.idToFamily; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; @@ -30,7 +29,6 @@ import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT; import static org.apache.datasketches.theta.PreambleUtil.extractEntryBitsV4; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; @@ -39,7 +37,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4; import static org.apache.datasketches.theta.PreambleUtil.wholeBytesToHoldBits; -import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem; +import static org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem; import java.lang.foreign.MemorySegment; @@ -160,7 +158,7 @@ public static CompactSketch wrap(final MemorySegment srcSeg, final long expected if 
(PreambleUtil.isEmptyFlag(srcSeg)) { return EmptyCompactSketch.getHeapInstance(srcSeg); } - if (otherCheckForSingleItem(srcSeg)) { + if (checkForSingleItem(srcSeg)) { return SingleItemSketch.heapify(srcSeg, seedHash); } //not empty & not singleItem @@ -190,25 +188,20 @@ public static CompactSketch wrap(final MemorySegment srcSeg, final long expected * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

    Only "Direct" Serialization Versions 3 and 4 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".

    + *

    Only sketches that have been explicitly stored as direct sketches can be wrapped.

    * *

    Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

    * - *

    This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image. - * Note that SerialVersion 1 (pre-open-source) sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.

    + *

    This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image.

    * * @param bytes a byte array image of a Sketch that was created using the DEFAULT_UPDATE_SEED. * * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes) { - return wrap(bytes, Util.DEFAULT_UPDATE_SEED, false); + return wrap(bytes, Util.DEFAULT_UPDATE_SEED); } /** @@ -216,18 +209,13 @@ public static CompactSketch wrap(final byte[] bytes) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

    Only "Direct" Serialization Versions 3 and 4 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".

    + *

    Only sketches that have been explicitly stored as direct sketches can be wrapped.

    * *

    Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.

    * - *

    This method checks if the given expectedSeed was used to create the source byte array image. - * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.

    + *

    This method checks if the given expectedSeed was used to create the source byte array image.

    * * @param bytes a byte array image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given byte array image. @@ -235,18 +223,13 @@ public static CompactSketch wrap(final byte[] bytes) { * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes, final long expectedSeed) { - return wrap(bytes, expectedSeed, true); - } - - private static CompactSketch wrap(final byte[] bytes, final long seed, final boolean enforceSeed) { final int serVer = bytes[PreambleUtil.SER_VER_BYTE]; final int familyId = bytes[PreambleUtil.FAMILY_BYTE]; final Family family = Family.idToFamily(familyId); if (family != Family.COMPACT) { throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } - final short seedHash = Util.computeSeedHash(seed); - + final short seedHash = Util.computeSeedHash(expectedSeed); if (serVer == 3) { final int flags = bytes[FLAGS_BYTE]; @@ -254,8 +237,8 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); } final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; - if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { - return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + if (checkForSingleItem(preLongs, serVer, familyId, flags)) { + return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), seedHash); } //not empty & not singleItem final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; @@ -268,8 +251,7 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo throw new SketchesArgumentException( "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } - return WrappedCompactSketch.wrapInstance(bytes, - enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + return WrappedCompactSketch.wrapInstance(bytes, seedHash); } if (serVer ==4) { return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); @@ -277,8 +259,6 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo //not SerVer 3 or 4 throw new SketchesArgumentException( "Corrupted: Serialization Version " + serVer + " not recognized."); - - } //Sketch Overrides diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java index 55d7aa31e..4a3b80839 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java @@ -84,7 +84,7 @@ public int getCurrentBytes() { private static final int START_PACKED_DATA_ESTIMATION_MODE = 16; @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch // number of entries is stored using variable length encoding // most significant bytes with all zeros are not stored // one byte in the preamble has the number of non-zero bytes used diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java index b289a1dbf..f393dc5b8 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java @@ -28,7 +28,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem; +import static 
org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem; import java.lang.foreign.MemorySegment; @@ -80,15 +80,15 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe @Override public int getCurrentBytes() { - if (otherCheckForSingleItem(seg_)) { return 16; } + if (checkForSingleItem(seg_)) { return 16; } final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 0 : extractCurCount(seg_); return (preLongs + curCount) << 3; } @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid - if (otherCheckForSingleItem(seg_)) { return 1; } + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch + if (checkForSingleItem(seg_)) { return 1; } final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs == 1) ? 0 : extractCurCount(seg_); } @@ -146,7 +146,7 @@ public byte[] toByteArray() { @Override long[] getCache() { - if (otherCheckForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } + if (checkForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 
0 : extractCurCount(seg_); if (curCount > 0) { diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java index 3480ac2ea..723b6cc75 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java @@ -179,7 +179,7 @@ static DirectQuickSelectSketch writableWrap( final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); if (isResizeFactorIncorrect(srcSeg, lgNomLongs, lgArrLongs)) { diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index e3f7197cc..f78fbced4 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -28,7 +28,6 @@ import static org.apache.datasketches.theta.CompactOperations.correctThetaOnCompact; import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE; -//import static org.apache.datasketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_RESIZE_FACTOR_BIT; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.P_FLOAT; @@ -38,6 +37,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static 
org.apache.datasketches.theta.PreambleUtil.extractThetaLong; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.insertThetaLong; import java.lang.foreign.MemorySegment; @@ -102,12 +102,11 @@ private DirectQuickSelectSketchR(final long seed, final MemorySegment srcSeg) { * @return instance of this sketch */ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 + final int preambleLongs = checkSegPreambleCap(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - - UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); + UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); return new DirectQuickSelectSketchR(seed, srcSeg); } @@ -147,7 +146,7 @@ public Family getFamily() { } @Override - public int getRetainedEntries(final boolean valid) { //always valid for theta + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return wseg_.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); } diff --git a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java index 45a17d40d..793ce1763 100644 --- a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java @@ -86,7 +86,7 @@ public int getCurrentBytes() { public double getEstimate() { return 0; } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return 0; } diff --git 
a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java index 6aeb09401..5a5c16f00 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java @@ -208,7 +208,7 @@ public double getLowerBound(final int numStdDev) { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch if (curCount_ > 0) { if (valid && isDirty()) { return HashOperations.countPart(getCache(), getLgArrLongs(), getThetaLong()); diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java index 50c419e61..69eebff5f 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java @@ -88,7 +88,7 @@ public int getCurrentBytes() { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return curCount_; } diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java index b51273404..c23deebf1 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java @@ -111,7 +111,7 @@ static HeapQuickSelectSketch heapifyInstance(final MemorySegment srcSeg, final l final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, 
lgArrLongs); final float p = extractP(srcSeg); //bytes 12-15 @@ -149,7 +149,7 @@ public Family getFamily() { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return curCount_; } diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index a95ebaaf6..ff35dfdaf 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -204,7 +204,7 @@ private PreambleUtil() {} static final int UNION_THETA_LONG = 24; //8-byte aligned, only used by Union // flag byte bit masks - static final int RESERVED_FLAG_MASK = 1; //Bit 0: Reserved, no longer used. + static final int RESERVED_FLAG_MASK = 1; //Bit 0: Reserved, no longer used. Was BigEndian static final int READ_ONLY_FLAG_MASK = 2; //Bit 1: Reserved, Set but not read. static final int EMPTY_FLAG_MASK = 4; //Bit 2: static final int COMPACT_FLAG_MASK = 8; //Bit 3: @@ -256,7 +256,7 @@ static String preambleToString(final byte[] byteArr) { * @return the summary preamble string. */ static String preambleToString(final MemorySegment seg) { - final int preLongs = getAndCheckPreLongs(seg); + final int preLongs = checkSegPreambleCap(seg); final int rfId = extractLgResizeFactor(seg); final ResizeFactor rf = ResizeFactor.getRF(rfId); final int serVer = extractSerVer(seg); @@ -515,17 +515,16 @@ static boolean isEmptyFlag(final MemorySegment seg) { * @param seg the given MemorySegment * @return the extracted prelongs value. 
*/ - static int getAndCheckPreLongs(final MemorySegment seg) { - final long cap = seg.byteSize(); - if (cap < 8) { - throwNotBigEnough(cap, 8); + static int checkSegPreambleCap(final MemorySegment seg) { + try { + final int preLongs = extractPreLongs(seg); + final int required = Math.max(preLongs << 3, 8); + final long cap = seg.byteSize(); + if (cap < required) { throwNotBigEnough(cap, required); } + return preLongs; + } catch (IndexOutOfBoundsException e) { //thrown by MemorySegment + throw new SketchesArgumentException("Possible Corruption: Given MemorySegment is empty."); } - final int preLongs = extractPreLongs(seg); - final int required = Math.max(preLongs << 3, 8); - if (cap < required) { - throwNotBigEnough(cap, required); - } - return preLongs; } static short checkSegmentSeedHash(final MemorySegment seg, final long seed) { @@ -534,10 +533,10 @@ static short checkSegmentSeedHash(final MemorySegment seg, final long seed) { return seedHashSeg; } - private static void throwNotBigEnough(final long cap, final int required) { + private static void throwNotBigEnough(final long cap, final long required) { throw new SketchesArgumentException( - "Possible Corruption: Size of byte array or MemorySegment not large enough: Size: " + cap - + ", Required: " + required); + "Possible Corruption: Size of MemorySegment not large enough: Size: " + cap + + " < Required: " + required); } static int wholeBytesToHoldBits(final int bits) { diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java index 062c6d86d..766e1850d 100644 --- a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java +++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java @@ -24,6 +24,7 @@ import static org.apache.datasketches.common.ByteArrayUtil.putLongLE; import static org.apache.datasketches.hash.MurmurHash3.hash; import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK; 
+import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; @@ -44,13 +45,13 @@ final class SingleItemSketch extends CompactSketch { private static final long DEFAULT_SEED_HASH = Util.computeSeedHash(Util.DEFAULT_UPDATE_SEED) & 0xFFFFL; // For backward compatibility, a candidate pre0_ long must have: - // Flags (byte 5): Ordered, Compact, NOT Empty, Read Only, LittleEndian = 11010 = 0x1A. + // Flags (byte 5): Ordered, Compact, NOT Empty, Read Only, NOT BigEndian = 11010 = 0x1A. (without SI flag) // Flags mask will be 0x1F. // SingleItem flag may not be set due to a historical bug, so we can't depend on it for now. // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3, // and the hash seed matches, it is virtually guaranteed that we have a SingleItem Sketch. 
- private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //with SI flag + private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //low 6 bytes, with SI flag private long pre0_ = 0; private long hash_ = 0; @@ -83,7 +84,7 @@ private SingleItemSketch(final long hash) { */ //does not override Sketch static SingleItemSketch heapify(final MemorySegment srcSeg, final short expectedSeedHash) { Util.checkSeedHashes((short) extractSeedHash(srcSeg), expectedSeedHash); - final boolean singleItem = otherCheckForSingleItem(srcSeg); + final boolean singleItem = checkForSingleItem(srcSeg); if (singleItem) { return new SingleItemSketch(srcSeg.get(JAVA_LONG_UNALIGNED, 8), expectedSeedHash); } throw new SketchesArgumentException("Input MemorySegment is not a SingleItemSketch."); } @@ -329,7 +330,7 @@ public double getLowerBound(final int numStdDev) { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return 1; } @@ -383,25 +384,28 @@ short getSeedHash() { return (short) (pre0_ >>> 48); } - static boolean otherCheckForSingleItem(final MemorySegment seg) { - return otherCheckForSingleItem(Sketch.getPreambleLongs(seg), extractSerVer(seg), - extractFamilyID(seg), extractFlags(seg) ); + static boolean checkForSingleItem(final MemorySegment seg) { + final int preLongs = checkSegPreambleCap(seg); + return checkForSingleItem(preLongs, extractSerVer(seg), extractFamilyID(seg), extractFlags(seg) ); } - static boolean otherCheckForSingleItem(final int preLongs, final int serVer, - final int famId, final int flags) { - // Flags byte: SI=X, Ordered=T, Compact=T, Empty=F, ReadOnly=T, Reserved=F = X11010 = 0x1A. + static boolean checkForSingleItem( + final int preLongs, + final int serVer, + final int famId, + final int flags) { + // Flags byte: SI=X, Ordered=T, Compact=T, Empty=F, ReadOnly=T, Reserved(BE)=F = X11010 = 0x1A. // Flags mask will be 0x1F. 
// SingleItem flag may not be set due to a historical bug, so we can't depend on it for now. // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3, // and the hash seed matches (not done here), it is virtually guaranteed that we have a // SingleItem Sketch. - final boolean numPreLongs = preLongs == 1; - final boolean numSerVer = serVer >= 3; - final boolean numFamId = famId == Family.COMPACT.getID(); - final boolean numFlags = (flags & 0x1F) == 0x1A; //no SI, yet - final boolean singleFlag = (flags & SINGLEITEM_FLAG_MASK) > 0; - return (numPreLongs && numSerVer && numFamId && numFlags) || singleFlag; + final boolean preLongsOK = preLongs == 1; + final boolean serVerOK = serVer >= 3; + final boolean famIdOK = famId == Family.COMPACT.getID(); + final boolean flagsOK = (flags & 0x1F) == 0x1A; //no SI, yet + final boolean singleFlagOK = (flags & SINGLEITEM_FLAG_MASK) > 0; + return (preLongsOK && serVerOK && famIdOK && flagsOK) || singleFlagOK; } } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 33593e430..d14519062 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -19,9 +19,6 @@ package org.apache.datasketches.theta; -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static org.apache.datasketches.common.Family.idToFamily; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.common.Util.LS; @@ -29,15 +26,12 @@ import static org.apache.datasketches.common.Util.zeroPad; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; -import static 
org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; +import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; +import static org.apache.datasketches.theta.PreambleUtil.extractFlags; +import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.PreambleUtil.getAndCheckPreLongs; import static org.apache.datasketches.thetacommon.HashOperations.count; import java.lang.foreign.MemorySegment; @@ -70,9 +64,6 @@ public abstract class Sketch implements MemorySegmentStatus { * Default Update Seed

    * was used to create the source MemorySegment image. * - *

    For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.

    - * * @param srcSeg an image of a Sketch. * * @return a Sketch on the heap. @@ -89,9 +80,6 @@ public static Sketch heapify(final MemorySegment srcSeg) { *

    For Update Sketches this method checks if the expectedSeed * was used to create the source MemorySegment image.

    * - *

    For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.

    - * * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. @@ -99,9 +87,9 @@ public static Sketch heapify(final MemorySegment srcSeg) { * @return a Sketch on the heap. */ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed) { - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); - if (family == Family.COMPACT) { + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.COMPACT.getID()) { return CompactSketch.heapify(srcSeg, expectedSeed); } return heapifyUpdateSketchFromMemorySegment(srcSeg, expectedSeed); @@ -112,10 +100,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

    Only "Direct" sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to "wrap".

    + *

    Only sketches that have been explicitly stored as direct sketches can be wrapped.

    * *

    Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. @@ -137,10 +122,7 @@ public static Sketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *

    Only "Direct" sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to "wrap".

    + *

    Only sketches that have been explicitly stored as direct sketches can be wrapped.

    * *

    Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. @@ -155,21 +137,15 @@ public static Sketch wrap(final MemorySegment srcSeg) { * @return a read-only Sketch backed by the given MemorySegment. */ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { - final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family == Family.QUICKSELECT) { - if (serVer == 3 && preLongs == 3) { - return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed); - } else { - throw new SketchesArgumentException( - "Corrupted: " + family + " family image: must have SerVer = 3 and preLongs = 3"); - } + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.QUICKSELECT.getID()) { + return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed); } - if (family == Family.COMPACT) { + if (familyID == Family.COMPACT.getID()) { return CompactSketch.wrap(srcSeg, expectedSeed); } + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "Cannot wrap family: " + family + " as a Sketch"); } @@ -260,10 +236,11 @@ public int getCountLessThanThetaLong(final long thetaLong) { * @return the result estimate */ public static double getEstimate(final MemorySegment srcSeg) { - final int famId = extractFamilyID(srcSeg); - if (!isValidSketchID(famId)) { - throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. 
Family: " - + Family.idToFamily(famId).toString()); + checkSegPreambleCap(srcSeg); + final int familyId = extractFamilyID(srcSeg); + if (!isValidSketchID(familyId)) { + throw new SketchesArgumentException("Source MemorySegment not a valid Sketch Family: " + + Family.idToFamily(familyId).toString()); } return Sketch.estimate(extractThetaLong(srcSeg), getRetainedEntries(srcSeg)); } @@ -338,53 +315,42 @@ public static int getUpdateSketchMaxBytes(final int lgNomEntries) { /** * Returns the number of valid entries that have been retained by the sketch. - * @return the number of valid retained entries + * For the Alpha Sketch this returns only valid entries. + * @return the number of valid retained entries. */ public int getRetainedEntries() { return getRetainedEntries(true); } + /** + * Returns the number of entries that have been retained by the sketch. + * @param valid This parameter is only relevant for the Alpha Sketch. + * if true, returns the number of valid entries, which are less than theta and used + * for estimation. Otherwise, return the number of all entries, valid or not, that are currently in the + * internal sketch cache. 
+ * @return the number of retained entries + */ + public abstract int getRetainedEntries(final boolean valid); + /** * Returns the number of valid entries that have been retained by the sketch from the given MemorySegment * @param srcSeg the given MemorySegment that has an image of a Sketch * @return the number of valid retained entries */ public static int getRetainedEntries(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); - if (Sketch.getThetaLong(srcSeg) == Long.MAX_VALUE && entries == 0) { - return 0; - } - return entries; - } - - final int preLongs = Sketch.getPreambleLongs(srcSeg); - final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; - if (preLongs == 1) { - return empty ? 0 : 1; - } - //preLongs > 1 - return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); + final int preLongs = checkSegPreambleCap(srcSeg); + final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) != 0; + return (preLongs == 1) ? (empty ? 0 : 1) : extractCurCount(srcSeg); } - /** - * Returns the number of entries that have been retained by the sketch. - * @param valid if true, returns the number of valid entries, which are less than theta and used - * for estimation. - * Otherwise, return the number of all entries, valid or not, that are currently in the internal - * sketch cache. 
- * @return the number of retained entries - */ - public abstract int getRetainedEntries(boolean valid); - /** * Returns the serialization version from the given MemorySegment * @param seg the sketch MemorySegment * @return the serialization version from the MemorySegment */ public static int getSerializationVersion(final MemorySegment seg) { - return seg.get(JAVA_BYTE, SER_VER_BYTE); + checkSegPreambleCap(seg); + return extractSerVer(seg); } /** @@ -624,20 +590,21 @@ public static String toString(final MemorySegment seg) { abstract short getSeedHash(); static boolean getEmpty(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); + checkSegPreambleCap(srcSeg); + final int serVer = extractSerVer(srcSeg); if (serVer == 1) { return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; } - return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; + return (extractFlags(srcSeg) & EMPTY_FLAG_MASK) != 0; } static int getPreambleLongs(final MemorySegment srcSeg) { - return getAndCheckPreLongs(srcSeg); + return checkSegPreambleCap(srcSeg); } static long getThetaLong(final MemorySegment srcSeg) { - final int preLongs = Sketch.getPreambleLongs(srcSeg); - return preLongs < 3 ? Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); + final int preLongs = checkSegPreambleCap(srcSeg); + return preLongs < 3 ? 
Long.MAX_VALUE : extractThetaLong(srcSeg); } /** @@ -702,20 +669,14 @@ static final double upperBound(final int curCount, final long thetaLong, final i * @return a Sketch */ private static final Sketch heapifyUpdateSketchFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { - final long cap = srcSeg.byteSize(); - if (cap < 8) { - throw new SketchesArgumentException( - "Corrupted: valid sketch must be at least 8 bytes."); - } - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); + final Family family = idToFamily(extractFamilyID(srcSeg)); if (family == Family.ALPHA) { - final int flags = PreambleUtil.extractFlags(srcSeg); + final int flags = extractFlags(srcSeg); final boolean compactFlag = (flags & COMPACT_FLAG_MASK) != 0; if (compactFlag) { throw new SketchesArgumentException( - "Corrupted: ALPHA family image: cannot be compact"); + "Corrupted: An ALPHA family image cannot be compact"); } return HeapAlphaSketch.heapifyInstance(srcSeg, expectedSeed); } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 4cd3a4cd4..7eb69eccb 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -19,19 +19,15 @@ package org.apache.datasketches.theta; -import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; -import static org.apache.datasketches.common.Util.checkBounds; import static org.apache.datasketches.hash.MurmurHash3.hash; import static org.apache.datasketches.theta.CompactOperations.componentsToCompact; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; import static 
org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.SER_VER; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.checkSegmentSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; @@ -106,12 +102,11 @@ public static UpdateSketch wrap( final MemorySegmentRequest mSegReq, final long expectedSeed) { Objects.requireNonNull(srcWSeg, "Source MemorySegment must not be null"); - checkBounds(0, 24, srcWSeg.byteSize()); //need min 24 bytes - final int preLongs = srcWSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits - final int serVer = srcWSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; //mask to byte - final int familyID = srcWSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; //mask to byte - final Family family = Family.idToFamily(familyID); - if (family != Family.QUICKSELECT) { + final int preLongs = checkSegPreambleCap(srcWSeg) & 0X3F; //mask to 6 bits; + final int serVer = extractSerVer(srcWSeg); + final int familyID = extractFamilyID(srcWSeg); + if (familyID != Family.QUICKSELECT.getID()) { + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "A " + family + " sketch cannot be wrapped as an UpdateSketch."); } @@ -150,9 +145,9 @@ public static UpdateSketch heapify(final MemorySegment srcSeg) { */ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); - checkBounds(0, 24, srcSeg.byteSize()); //need min 24 bytes - final Family family = 
Family.idToFamily(srcSeg.get(JAVA_BYTE, FAMILY_BYTE)); - if (family.equals(Family.ALPHA)) { + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.ALPHA.getID()) { return HeapAlphaSketch.heapifyInstance(srcSeg, expectedSeed); } return HeapQuickSelectSketch.heapifyInstance(srcSeg, expectedSeed); @@ -418,23 +413,23 @@ public UpdateReturnState update(final long[] data) { */ abstract boolean isOutOfSpace(int numEntries); - static void checkUnionQuickSelectFamily(final MemorySegment seg, final int preambleLongs, - final int lgNomLongs) { + static void checkUnionAndQuickSelectFamily(final MemorySegment seg, final int preambleLongs, final int lgNomLongs) { + //Check Family final int familyID = extractFamilyID(seg); //byte 2 - final Family family = Family.idToFamily(familyID); - if (family.equals(Family.UNION)) { + if (familyID == Family.UNION.getID()) { if (preambleLongs != Family.UNION.getMinPreLongs()) { throw new SketchesArgumentException( "Possible corruption: Invalid PreambleLongs value for UNION: " + preambleLongs); } } - else if (family.equals(Family.QUICKSELECT)) { + else if (familyID == Family.QUICKSELECT.getID()) { if (preambleLongs != Family.QUICKSELECT.getMinPreLongs()) { throw new SketchesArgumentException( "Possible corruption: Invalid PreambleLongs value for QUICKSELECT: " + preambleLongs); } } else { + final Family family = Family.idToFamily(familyID); throw new SketchesArgumentException( "Possible corruption: Invalid Family: " + family.toString()); } diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java index c4affc9ce..584338469 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java @@ -69,7 +69,7 @@ public int getCurrentBytes() { private static final int 
START_PACKED_DATA_ESTIMATION_MODE = 16; @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch // number of entries is stored using variable length encoding // most significant bytes with all zeros are not stored // one byte in the preamble has the number of non-zero bytes used diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java index 08726a7ff..1f3f3ab9e 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java @@ -80,7 +80,7 @@ public int getCurrentBytes() { } @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch final int preLongs = bytes_[PREAMBLE_LONGS_BYTE]; return (preLongs == 1) ? 0 : getIntLE(bytes_, RETAINED_ENTRIES_INT); } From 72c7a2467c210280e1efc5958b245191e1cb5fd9 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Fri, 3 Oct 2025 13:07:03 -0700 Subject: [PATCH 24/26] In response to the incorrect Pilot objection, I am adding a few sanity parenthesis to make expressions easier to understand. 
--- .../java/org/apache/datasketches/theta/HeapUpdateSketch.java | 2 +- src/main/java/org/apache/datasketches/theta/PreambleUtil.java | 2 +- src/main/java/org/apache/datasketches/theta/UpdateSketch.java | 2 +- .../java/org/apache/datasketches/theta/UpdateSketchTest.java | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java index aff348281..56175a019 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java @@ -96,7 +96,7 @@ short getSeedHash() { byte[] toByteArray(final int preLongs, final byte familyID) { if (isDirty()) { rebuild(); } checkIllegalCurCountAndEmpty(isEmpty(), getRetainedEntries(true)); - final int preBytes = preLongs << 3 & 0X3F; //24 bytes; mask to 6 bits + final int preBytes = (preLongs << 3) & 0X3F; //24 bytes; mask to 6 bits final int dataBytes = getCurrentDataLongs() << 3; final byte[] byteArrOut = new byte[preBytes + dataBytes]; diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index ff35dfdaf..4dd993eb3 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -450,7 +450,7 @@ static void insertLgResizeFactor(final MemorySegment seg, final int rf) { final int curByte = seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0xFF; final int shift = LG_RESIZE_FACTOR_BIT; // shift in bits final int mask = 3; - final byte newByte = (byte) ((rf & mask) << shift | ~(mask << shift) & curByte); + final byte newByte = (byte) (((rf & mask) << shift) | (~(mask << shift) & curByte)); seg.set(JAVA_BYTE, PREAMBLE_LONGS_BYTE, newByte); } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java 
b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 7eb69eccb..723d57a96 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -173,7 +173,7 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstWS public int getCompactBytes() { final int preLongs = getCompactPreambleLongs(); final int dataLongs = getRetainedEntries(true); - return preLongs + dataLongs << 3; + return (preLongs + dataLongs) << 3; } @Override diff --git a/src/test/java/org/apache/datasketches/theta/UpdateSketchTest.java b/src/test/java/org/apache/datasketches/theta/UpdateSketchTest.java index 3e067d2aa..a3bdaa221 100644 --- a/src/test/java/org/apache/datasketches/theta/UpdateSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/UpdateSketchTest.java @@ -212,7 +212,7 @@ public void checkCompactOpsMemorySegmentToCompact() { CompactSketch csk1, csk2, csk3; final int lgK = 6; final UpdateSketch sk = UpdateSketch.builder().setLogNominalEntries(lgK).build(); - final int n = 1 << lgK + 1; + final int n = 1 << (lgK + 1); for (int i = 2; i < n; i++) { sk.update(i); } final int cbytes = sk.getCompactBytes(); final byte[] byteArr = sk.toByteArray(); From f9e37ecbd8115d83bdad299777ccc99a233eebd0 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sat, 4 Oct 2025 10:36:02 -0700 Subject: [PATCH 25/26] Correct spelling of "HLL" in Javadoc --- src/main/java/org/apache/datasketches/hll/TgtHllType.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/datasketches/hll/TgtHllType.java b/src/main/java/org/apache/datasketches/hll/TgtHllType.java index b7f8d45ad..78aaafd1b 100644 --- a/src/main/java/org/apache/datasketches/hll/TgtHllType.java +++ b/src/main/java/org/apache/datasketches/hll/TgtHllType.java @@ -60,7 +60,7 @@ public enum TgtHllType { */ HLL_6, /** - * An Hll Sketch with a bin size of 8 bits + * An HLL Sketch with a bin size 
of 8 bits */ HLL_8; From 2a8c59122c6a2c6b514cf9649c7412c6e06c4cd0 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Sat, 4 Oct 2025 11:45:44 -0700 Subject: [PATCH 26/26] Spelling error --- src/main/java/org/apache/datasketches/cpc/CpcSketch.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/datasketches/cpc/CpcSketch.java b/src/main/java/org/apache/datasketches/cpc/CpcSketch.java index bd154eb5f..212670c50 100644 --- a/src/main/java/org/apache/datasketches/cpc/CpcSketch.java +++ b/src/main/java/org/apache/datasketches/cpc/CpcSketch.java @@ -19,9 +19,9 @@ package org.apache.datasketches.cpc; -import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.lang.Math.log; import static java.lang.Math.sqrt; +import static java.lang.foreign.ValueLayout.JAVA_BYTE; import static java.nio.charset.StandardCharsets.UTF_8; import static org.apache.datasketches.common.Util.LS; import static org.apache.datasketches.common.Util.invPow2; @@ -81,7 +81,7 @@ public final class CpcSketch { byte[] slidingWindow; //either null or size K bytes PairTable pairTable; //for sparse and surprising values, either null or variable size - //The following variables are only valid in HIP varients + //The following variables are only valid in HIP variants double kxp; //used with HIP double hipEstAccum; //used with HIP