From b193b7012ca0460548502c576c9f5da4b6d6a242 Mon Sep 17 00:00:00 2001
From: Lee Rhodes The given dstSeg is checked for the required capacity as determined by
* {@link #getMaxUpdatableSerializationBytes(int, TgtHllType)}.
* @param srcWseg a writable image of a valid source sketch with data.
- * @return an HllSketch where the sketch data is in the given dstSeg.
+ * @return an HllSketch where the sketch data is in the given srcWseg.
*/
public static final HllSketch writableWrap(final MemorySegment srcWseg) {
if (srcWseg.isReadOnly()) { return wrap(srcWseg); }
@@ -251,7 +251,7 @@ static final HllSketch writableWrap( final MemorySegment srcWseg, final boolean
checkBounds(0, 8, srcWseg.byteSize()); //need min 8 bytes
if (extractCompactFlag(srcWseg)) {
throw new SketchesArgumentException(
- "Cannot perform a writableWrap of a writable sketch image that is in compact form. "
+ "Cannot perform a writableWrap of a sketch image that is in compact form. "
+ "Compact sketches are by definition immutable.");
}
final int lgConfigK = extractLgK(srcWseg);
From 3388762d927e1f28a8fd814d9fb02db93352aa36 Mon Sep 17 00:00:00 2001
From: Lee Rhodes Note: a single occurrence of a NaN in the array will force this method to use the conventional update path
+ * rather than the fast update path. Note: a single occurrence of a NaN in the array will force this method to use the conventional update path
+ * rather than the fast update path. This method checks if the DEFAULT_UPDATE_SEED was used to create the source MemorySegment image.
+ * This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image.
* Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field,
* so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.
This method checks if the given expectedSeed was used to create the source MemorySegment image. + *
This method checks if the given expectedSeed was used to create the source byte array image. * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.
* * @param bytes a byte array image of a Sketch that was created using the given expectedSeed. - * @param expectedSeed the seed used to validate the given MemorySegment image. + * @param expectedSeed the seed used to validate the given byte array image. * See Update Hash Seed. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes, final long expectedSeed) { return wrap(bytes, expectedSeed, true); diff --git a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java index 6d9c173a0..8ef889be4 100644 --- a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java @@ -582,7 +582,7 @@ public void serializeDeserializeHeapV4() { } @Test - public void serializeDeserializeDirectV4() { + public void serializeDeserializeDirectV4_segment() { final UpdateSketch sk = Sketches.updateSketchBuilder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); @@ -598,6 +598,24 @@ public void serializeDeserializeDirectV4() { } } + @Test + public void serializeDeserializeDirectV4_bytes() { + final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + for (int i = 0; i < 10000; i++) { + sk.update(i); + } + final CompactSketch cs1 = sk.compact(true, MemorySegment.ofArray(new byte[sk.getCompactBytes()])); + final byte[] bytes = cs1.toByteArrayCompressed(); + final CompactSketch cs2 = CompactSketch.wrap(bytes); + assertEquals(cs1.getRetainedEntries(), cs2.getRetainedEntries()); + final HashIterator it1 = cs1.iterator(); + final HashIterator it2 = cs2.iterator(); + while (it1.next() && it2.next()) { + assertEquals(it2.get(), it2.get()); + } + } + + @Test public void serializeWrapBytesV3() { final UpdateSketch sk = Sketches.updateSketchBuilder().build(); diff --git 
a/src/test/java/org/apache/datasketches/theta/ThetaSketchCrossLanguageTest.java b/src/test/java/org/apache/datasketches/theta/ThetaSketchCrossLanguageTest.java index 7d69b3832..64449027a 100644 --- a/src/test/java/org/apache/datasketches/theta/ThetaSketchCrossLanguageTest.java +++ b/src/test/java/org/apache/datasketches/theta/ThetaSketchCrossLanguageTest.java @@ -45,7 +45,7 @@ public class ThetaSketchCrossLanguageTest { @Test(groups = {GENERATE_JAVA_FILES}) public void generateBinariesForCompatibilityTesting() throws IOException { final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000}; - for (int n: nArr) { + for (final int n: nArr) { final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < n; i++) { sk.update(i); @@ -57,7 +57,7 @@ public void generateBinariesForCompatibilityTesting() throws IOException { @Test(groups = {GENERATE_JAVA_FILES}) public void generateBinariesForCompatibilityTestingCompressed() throws IOException { final int[] nArr = {10, 100, 1000, 10_000, 100_000, 1_000_000}; - for (int n: nArr) { + for (final int n: nArr) { final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < n; i++) { sk.update(i); @@ -76,9 +76,9 @@ public void generateBinariesForCompatibilityTestingNonEmptyNoEntries() throws IO } @Test(groups = {CHECK_CPP_FILES}) - public void deserializeFromCpp() throws IOException { + public void deserializeFromCppSegment() throws IOException { final int[] nArr = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; - for (int n: nArr) { + for (final int n: nArr) { final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_n" + n + "_cpp.sk")); final CompactSketch sketch = CompactSketch.wrap(MemorySegment.ofArray(bytes)); assertTrue(n == 0 ? 
sketch.isEmpty() : !sketch.isEmpty()); @@ -95,9 +95,28 @@ public void deserializeFromCpp() throws IOException { } @Test(groups = {CHECK_CPP_FILES}) - public void deserializeFromCppCompressed() throws IOException { + public void deserializeFromCppBytes() throws IOException { + final int[] nArr = {0, 1, 10, 100, 1000, 10000, 100000, 1000000}; + for (final int n: nArr) { + final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_n" + n + "_cpp.sk")); + final CompactSketch sketch = CompactSketch.wrap(bytes); + assertTrue(n == 0 ? sketch.isEmpty() : !sketch.isEmpty()); + assertEquals(sketch.getEstimate(), n, n * 0.03); + assertTrue(sketch.isOrdered()); + final HashIterator it = sketch.iterator(); + long previous = 0; + while (it.next()) { + assertTrue(it.get() < sketch.getThetaLong()); + assertTrue(it.get() > previous); + previous = it.get(); + } + } + } + + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppCompressedSegment() throws IOException { final int[] nArr = {10, 100, 1000, 10000, 100000, 1000000}; - for (int n: nArr) { + for (final int n: nArr) { final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_compressed_n" + n + "_cpp.sk")); final CompactSketch sketch = CompactSketch.wrap(MemorySegment.ofArray(bytes)); assertTrue(n == 0 ? sketch.isEmpty() : !sketch.isEmpty()); @@ -114,11 +133,38 @@ public void deserializeFromCppCompressed() throws IOException { } @Test(groups = {CHECK_CPP_FILES}) - public void deserializeFromCppNonEmptyNoEntries() throws IOException { + public void deserializeFromCppCompressedBytes() throws IOException { + final int[] nArr = {10, 100, 1000, 10000, 100000, 1000000}; + for (final int n: nArr) { + final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_compressed_n" + n + "_cpp.sk")); + final CompactSketch sketch = CompactSketch.wrap(bytes); + assertTrue(n == 0 ? 
sketch.isEmpty() : !sketch.isEmpty()); + assertEquals(sketch.getEstimate(), n, n * 0.03); + assertTrue(sketch.isOrdered()); + final HashIterator it = sketch.iterator(); + long previous = 0; + while (it.next()) { + assertTrue(it.get() < sketch.getThetaLong()); + assertTrue(it.get() > previous); + previous = it.get(); + } + } + } + + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppNonEmptyNoEntriesSegment() throws IOException { final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_non_empty_no_entries_cpp.sk")); final CompactSketch sketch = CompactSketch.wrap(MemorySegment.ofArray(bytes)); assertFalse(sketch.isEmpty()); assertEquals(sketch.getRetainedEntries(), 0); } + @Test(groups = {CHECK_CPP_FILES}) + public void deserializeFromCppNonEmptyNoEntriesBytes() throws IOException { + final byte[] bytes = Files.readAllBytes(cppPath.resolve("theta_non_empty_no_entries_cpp.sk")); + final CompactSketch sketch = CompactSketch.wrap(bytes); + assertFalse(sketch.isEmpty()); + assertEquals(sketch.getRetainedEntries(), 0); + } + } From 5e2e17ad0ceca6c26733e6c7e97d24fb941a7754 Mon Sep 17 00:00:00 2001 From: Lee RhodesOnly "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have + *
Only "Direct" Serialization Versions 3 and 4 (i.e., OpenSource) sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a heapify operation. * These early versions were never designed to "wrap".
@@ -242,7 +246,7 @@ else if (serVer == 2) { * This is actually faster and consumes less overall space. * *This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image. - * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, + * Note that SerialVersion 1 (pre-open-source) sketches cannot be checked as they don't have a seedHash field, * so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.
* * @param bytes a byte array image of a Sketch that was created using the DEFAULT_UPDATE_SEED. @@ -258,7 +262,7 @@ public static CompactSketch wrap(final byte[] bytes) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have + *
Only "Direct" Serialization Versions 3 and 4 (i.e., OpenSource) sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a heapify operation. * These early versions were never designed to "wrap".
@@ -288,38 +292,46 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } final short seedHash = Util.computeSeedHash(seed); - if (serVer == 4) { - return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); - } else if (serVer == 3) { - final int flags = bytes[FLAGS_BYTE]; - if ((flags & EMPTY_FLAG_MASK) > 0) { - return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); + + switch (serVer) { + case 1: { + return ForwardCompatibility.heapify1to3(MemorySegment.ofArray(bytes), seedHash); } - final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; - if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { - return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + case 2: { + return ForwardCompatibility.heapify2to3(MemorySegment.ofArray(bytes), + enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); } - //not empty & not singleItem - final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; - if (!compactFlag) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have compact flag set"); + case 3: { + final int flags = bytes[FLAGS_BYTE]; + if ((flags & EMPTY_FLAG_MASK) > 0) { + return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); + } + final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; + if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { + return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + } + //not empty & not singleItem + final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; + if (!compactFlag) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have compact flag set"); + } + final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; + if (!readOnly) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have Read-Only flag set"); + } + return WrappedCompactSketch.wrapInstance(bytes, + enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + } + case 4: { + return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); } - final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; - if (!readOnly) { + default: { throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have Read-Only flag set"); + "Corrupted: Serialization Version " + serVer + " not recognized."); } - return WrappedCompactSketch.wrapInstance(bytes, - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); - } else if (serVer == 1) { - return ForwardCompatibility.heapify1to3(MemorySegment.ofArray(bytes), seedHash); - } else if (serVer == 2) { - return ForwardCompatibility.heapify2to3(MemorySegment.ofArray(bytes), - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); } - throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); } //Sketch Overrides diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index 0a81f4887..9926c9b79 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -51,8 +51,7 @@ import org.apache.datasketches.thetacommon.ThetaUtil; /** - * The default Theta Sketch using the QuickSelect algorithm. 
- * This is the read-only implementation with non-functional methods, which affect the state. + * The read-only Theta Sketch using the QuickSelect algorithm. * *This implementation uses data in a given MemorySegment that is owned and managed by the caller. * This MemorySegment can be off-heap, which if managed properly will greatly reduce the need for @@ -65,17 +64,16 @@ class DirectQuickSelectSketchR extends UpdateSketch { static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space final long seed_; //provided, kept only on heap, never serialized. int hashTableThreshold_; //computed, kept only on heap, never serialized. - MemorySegment wseg_; //A MemorySegment for child class, but no write methods here + MemorySegment wseg_; //This reference is shared with the writable child class, but no write methods here - //only called by DirectQuickSelectSketch and below + //only called by the writable DirectQuickSelectSketch and this class. DirectQuickSelectSketchR(final long seed, final MemorySegment wseg) { seed_ = seed; wseg_ = wseg; } /** - * Wrap a sketch around the given source MemorySegment containing sketch data that originated from - * this sketch. + * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. * @param srcSeg the source MemorySegment. * The given MemorySegment object must be in hash table form and not read only. 
* @param seed See Update Hash Seed @@ -89,8 +87,7 @@ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final l UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); - final DirectQuickSelectSketchR dqssr = - new DirectQuickSelectSketchR(seed, srcSeg); + final DirectQuickSelectSketchR dqssr = new DirectQuickSelectSketchR(seed, srcSeg); dqssr.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqssr; } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 6310d82c4..e551f33c4 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -63,7 +63,7 @@ public abstract class Sketch implements MemorySegmentStatus { * was used to create the source MemorySegment image. * *
For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked.
+ * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked. * * @param srcSeg an image of a Sketch. * @@ -83,8 +83,12 @@ public static Sketch heapify(final MemorySegment srcSeg) { * *The resulting sketch will not retain any link to the source MemorySegment.
* - *For Update and Compact Sketches this method checks if the given expectedSeed was used to - * create the source MemorySegment image. However, SerialVersion 1 sketches cannot be checked.
+ *For Update Sketches this method checks if the + * given expectedSeed
+ * was used to create the source MemorySegment image. + * + *For Compact Sketches this method assumes that the sketch image was created with the + * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.
* * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -109,8 +113,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to - * "wrap".
+ * where all data will be copied to the heap. These early versions were never designed to "wrap". * *Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. @@ -121,10 +124,10 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * was used to create the source MemorySegment image. * *
For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked.
+ * correct hash seed, so it is not checked. SerialVersion 1 (pre-open-source) sketches cannot be checked. * - * @param srcSeg an image of a Sketch. - * @return a Sketch backed by the given MemorySegment + * @param srcSeg a MemorySegment with an image of a Sketch. + * @return a read-only Sketch backed by the given MemorySegment */ public static Sketch wrap(final MemorySegment srcSeg) { final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; @@ -154,20 +157,23 @@ public static Sketch wrap(final MemorySegment srcSeg) { *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to - * "wrap".
+ * where all data will be copied to the heap. These early versions were never designed to "wrap". * *Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *For Update and Compact Sketches this method checks if the given expectedSeed was used to - * create the source MemorySegment image. However, SerialVersion 1 sketches cannot be checked.
+ *For Update Sketches this method checks if the + * given expectedSeed
+ * was used to create the source MemorySegment image. + * + *For Compact Sketches this method assumes that the sketch image was created with the + * correct hash seed, so it is not checked. SerialVersion 1 (pre-open-source) sketches cannot be checked.
* * @param srcSeg a MemorySegment with an image of a Sketch. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. - * @return a UpdateSketch backed by the given MemorySegment except as above. + * @return a read-only Sketch backed by the given MemorySegment. */ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; @@ -203,7 +209,7 @@ public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) { * @return this sketch as an ordered CompactSketch. */ public CompactSketch compact() { - return (this.isCompact()) ? (CompactSketch)this : compact(true, null); + return isCompact() ? (CompactSketch)this : compact(true, null); } /** diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java index e9a952ab4..c4affc9ce 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java @@ -28,7 +28,8 @@ import org.apache.datasketches.common.Util; /** - * Wrapper around a serialized compact compressed read-only sketch. It is not empty, not a single item. + * A wrapper around a serialized compact compressed read-only sketch in the form of a byte array. + * It is not an empty nor a single item sketch. * *This sketch can only be associated with a Serialization Version 4 format binary image.
*/ diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java index a5b67363f..08726a7ff 100644 --- a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java @@ -37,9 +37,10 @@ import org.apache.datasketches.common.Util; /** - * Wrapper around a serialized compact read-only sketch. It is not empty, not a single item. + * A wrapper around a serialized compact read-only sketch in the form of a byte array. + * It is not an empty nor a single item sketch. * - *This sketch can only be associated with a Serialization Version 3 format binary image.
+ *This sketch can only be associated with a Serialization Version 3 binary image format.
*/ class WrappedCompactSketch extends CompactSketch { final byte[] bytes_; From 88d603729132ab67dde0193fe42b22a8543521ae Mon Sep 17 00:00:00 2001 From: Lee RhodesThis implementation uses data in a given MemorySegment that is owned and managed by the caller. * This MemorySegment can be off-heap, which if managed properly will greatly reduce the need for @@ -61,16 +60,41 @@ * @author Kevin Lang */ class DirectQuickSelectSketchR extends UpdateSketch { - static final double DQS_RESIZE_THRESHOLD = 15.0 / 16.0; //tuned for space - int hashTableThreshold_; //computed, kept only on heap, never serialized. - MemorySegment wseg_; //This reference is shared with the writable child class, but no write methods here - //only called by the writable DirectQuickSelectSketch and this class. - DirectQuickSelectSketchR(final long seed, final MemorySegment wseg) { + /** + * This MemorySegment reference is also used by the writable child DirectQuickSelectSketch. + * + *
When this class is constructed with the writable constructor, called by the writable child DirectQuickSelectSketch, + * this reference can be changed and its contents can be modified.
+ * + *When this class is constructed with the read-only constructor, called from local factories, this MemorySegment will + * be placed in read-only mode.
+ */ + MemorySegment wseg_; // + + /** + * This writable constructor is only called by the writable child DirectQuickSelectSketch and then this class provides the + * read-only methods for the DirectQuickSelectSketch class. + * @param wseg the writable MemorySegment used by the writable child DirectQuickSelectSketch. + * @param seed the seed for the update function for the writable child DirectQuickSelectSketch. + */ + DirectQuickSelectSketchR(final MemorySegment wseg, final long seed) { + Objects.requireNonNull(wseg, "MemorySegment wseg must not be null"); super(seed); wseg_ = wseg; } + /** + * This read-only constructor is only called by local factory methods which use this class as a read-only direct sketch. + * @param seed the seed used to validate the internal hashes of the given source MemorySegment. + * @param srcSeg the read-only MemorySegment used by this class in read-only mode. + */ + private DirectQuickSelectSketchR(final long seed, final MemorySegment srcSeg) { + Objects.requireNonNull(srcSeg, "MemorySegment srcSeg must not be null"); + super(seed); + wseg_ = srcSeg.asReadOnly(); + } + /** * Wrap a sketch around the given source MemorySegment containing sketch data that originated from this sketch. * @param srcSeg the source MemorySegment. @@ -85,26 +109,19 @@ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final l UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); - - final DirectQuickSelectSketchR dqssr = new DirectQuickSelectSketchR(seed, srcSeg); - dqssr.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - return dqssr; + return new DirectQuickSelectSketchR(seed, srcSeg); } /** * Fast-wrap a sketch around the given source MemorySegment containing sketch data that originated from * this sketch. This does NO validity checking of the given MemorySegment. 
+ * Caller must ensure segment contents are a valid sketch image. * @param srcSeg The given MemorySegment object must be in hash table form and not read only. * @param seed See Update Hash Seed * @return instance of this sketch */ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int lgNomLongs = srcSeg.get(JAVA_BYTE, LG_NOM_LONGS_BYTE) & 0XFF; //mask to byte - final int lgArrLongs = srcSeg.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte - - final DirectQuickSelectSketchR dqss = new DirectQuickSelectSketchR(seed, srcSeg); - dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); - return dqss; + return new DirectQuickSelectSketchR(seed, srcSeg); } //Sketch @@ -112,7 +129,7 @@ static DirectQuickSelectSketchR fastReadOnlyWrap(final MemorySegment srcSeg, fin @Override public int getCurrentBytes() { //not compact - final byte lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE); + final int lgArrLongs = wseg_.get(JAVA_BYTE, LG_ARR_LONGS_BYTE) & 0XFF; //mask to byte final int preLongs = wseg_.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits return preLongs + (1 << lgArrLongs) << 3; } @@ -131,7 +148,7 @@ public Family getFamily() { } @Override - public int getRetainedEntries(final boolean valid) { //always valid + public int getRetainedEntries(final boolean valid) { //always valid for theta return wseg_.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); } @@ -157,7 +174,7 @@ public boolean isEmpty() { @Override public boolean isSameResource(final MemorySegment that) { - return hasMemorySegment() && MemorySegmentStatus.isSameResource(wseg_, that); + return hasMemorySegment() && MemorySegmentStatus.isSameResource(wseg_, that); //null checks done here } @Override @@ -167,14 +184,14 @@ public HashIterator iterator() { @Override public byte[] toByteArray() { //MY_FAMILY is stored in wseg_ - checkIllegalCurCountAndEmpty(isEmpty(), extractCurCount(wseg_)); + final int curCount = 
extractCurCount(wseg_); + checkIllegalCurCountAndEmpty(isEmpty(), curCount); final int lengthBytes = getCurrentBytes(); final byte[] byteArray = new byte[lengthBytes]; final MemorySegment seg = MemorySegment.ofArray(byteArray); MemorySegment.copy(wseg_, 0, seg, 0, lengthBytes); - final long thetaLong = - correctThetaOnCompact(isEmpty(), extractCurCount(wseg_), extractThetaLong(wseg_)); - insertThetaLong(wseg_, thetaLong); + final long thetaLong = correctThetaOnCompact(isEmpty(), curCount, extractThetaLong(wseg_)); + insertThetaLong(seg, thetaLong); return byteArray; } @@ -242,8 +259,8 @@ boolean isDirty() { } @Override - boolean isOutOfSpace(final int numEntries) { - return numEntries > hashTableThreshold_; + boolean isOutOfSpace(final int numEntries) { //overridden by writable DirectQuickSelectSketch + return false; } @Override @@ -260,19 +277,4 @@ UpdateReturnState hashUpdate(final long hash) { throw new SketchesReadOnlyException(); } - /** - * Returns the cardinality limit given the current size of the hash table array. - * - * @param lgNomLongs See lgNomLongs. - * @param lgArrLongs See lgArrLongs. - * @return the hash table threshold - */ - @SuppressFBWarnings(value = "DB_DUPLICATE_BRANCHES", justification = "False Positive, see the code comments") - protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { - //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, - //but this allows us to tune these constants for different sketches. - final double fraction = lgArrLongs <= lgNomLongs ? 
DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; - return (int) (fraction * (1 << lgArrLongs)); - } - } diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index b3451fcd1..296fc7d5e 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -126,17 +126,17 @@ ** Long || Start Byte Adr: * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || Seed Hash | Flags | numEB | entBits| FamID | SerVer | PreLongs = 3 | + * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | + * 0 || Seed Hash | Flags | numEB | entBits| FamID | SerVer=4 | PreLongs = 3 | * - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 ||------------------------------THETA_LONG-------------------------------------------| + * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | + * 1 ||------------------------------THETA_LONG-------------------------------------------| (only if estimating) * - * || | | | (20) | (19) | (18) | (17) | 16 | - * 2 ||----------------Retained Entries stored as 1 to 4 bytes----------------------------| + * || | | | 20 | (19) | (18) | (17) | 16 | + * 2 ||--------Retained Entries stored as 1 to 4 bytes in bytes 16-19---------------------| * - * || | | | | | | | | - * 3 ||------------------Delta encoded compressed byte array------------------------------| + * || | | | | | | | | + * 3 ||--------Delta encoded compressed byte array starts at bytes 17-20------------------| ** *
The UpdateSketch and AlphaSketch require 24 bytes of preamble followed by a non-compact @@ -318,7 +318,7 @@ else if (preLongs == 4) { //Union sb.append("Byte 0: ResizeFactor : ").append(rfId + ", " + rf.toString()).append(LS); sb.append("Byte 1: Serialization Version: ").append(serVer).append(LS); sb.append("Byte 2: Family : ").append(familyId + ", " + family.toString()).append(LS); - sb.append("Byte 3: LgNomLongs : ").append(lgNomLongs).append(LS); + sb.append("Byte 3: LgNomLongs, LgK : ").append(lgNomLongs).append(LS); sb.append("Byte 4: LgArrLongs : ").append(lgArrLongs).append(LS); sb.append("Byte 5: Flags Field : ").append(flagsStr).append(LS); sb.append(" Bit Flag Name : State:").append(LS); @@ -351,8 +351,13 @@ else if (preLongs == 3) { sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS); sb.append(" Theta (long) : ").append(thetaLong).append(LS); sb.append(" Theta (long,hex) : ").append(thetaHex).append(LS); + if (serVer == 4) { + sb.append( "TOTAL Storage Bytes : ").append(seg.byteSize()).append(LS); + sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS); + return sb.toString(); + } } - else { //preLongs == 4 + else { //preLongs == 4 (Union) sb.append("Bytes 8-11 : CurrentCount : ").append(curCount).append(LS); sb.append("Bytes 12-15: P : ").append(p).append(LS); sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS); @@ -363,9 +368,8 @@ else if (preLongs == 3) { sb.append(" ThetaU (long,hex): ").append(thetaUHex).append(LS); } sb.append( "Preamble Bytes : ").append(preLongs * 8).append(LS); - sb.append( "Data Bytes : ").append(curCount * 8).append(LS); - sb.append( "TOTAL Sketch Bytes : ").append((preLongs + curCount) * 8).append(LS); - sb.append( "TOTAL Capacity Bytes : ").append(seg.byteSize()).append(LS); + sb.append( "Retained Data Bytes : ").append(curCount * 8).append(LS); + sb.append( "TOTAL Storage Bytes : ").append(seg.byteSize()).append(LS); sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS); return 
sb.toString(); } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index e551f33c4..5c14b8fda 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -333,6 +333,16 @@ public static int getMaxUpdateSketchBytes(final int nomEntries) { return (nomEnt << 4) + (Family.QUICKSELECT.getMaxPreLongs() << 3); } + /** + * Returns the maximum number of storage bytes required for an UpdateSketch with the given + * log_base2 of the nominal entries. + * @param lgNomEntries log_base2 of Nominal Entries + * @return the maximum number of storage bytes required for a UpdateSketch with the given lgNomEntries + */ + public static int getUpdateSketchMaxBytes(final int lgNomEntries) { + return (1 << lgNomEntries << 4) + (Family.QUICKSELECT.getMaxPreLongs() << 3); + } + /** * Returns the number of valid entries that have been retained by the sketch. * @return the number of valid retained entries @@ -451,7 +461,10 @@ public String toString() { * @param hexMode If true, hashes will be output in hex. * @return The result string, which can be very long. */ - public String toString(final boolean sketchSummary, final boolean dataDetail, final int width, + public String toString( + final boolean sketchSummary, + final boolean dataDetail, + final int width, final boolean hexMode) { final StringBuilder sb = new StringBuilder(); @@ -554,6 +567,9 @@ public static String toString(final MemorySegment seg) { /** * Gets the internal cache array. For on-heap sketches this will return a reference to the actual * cache array. For MemorySegment-based sketches this returns a copy. + * + *
This can be an expensive operation and is intended for diagnostic & test applications. + * Use {@link #iterator() iterator()} instead.
* @return the internal cache array. */ abstract long[] getCache(); diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 4635e75a0..f080e976a 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -75,6 +75,9 @@ public abstract class UpdateSketch extends Sketch { * @param srcWSeg an image of a writable sketch where the image seed hash matches the default seed hash. * It must have a size of at least 24 bytes. * @return an UpdateSketch backed by the given MemorySegment + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch wrap(final MemorySegment srcWSeg) { return wrap(srcWSeg, Util.DEFAULT_UPDATE_SEED); @@ -92,6 +95,9 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg) { * See Update Hash Seed. * Compact sketches store a 16-bit hash of the seed, but not the seed itself. * @return a UpdateSketch backed by the given MemorySegment + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expectedSeed) { Objects.requireNonNull(srcWSeg, "Source MemorySegment must not be null"); @@ -118,6 +124,9 @@ public static UpdateSketch wrap(final MemorySegment srcWSeg, final long expected * @param srcSeg the given MemorySegment with a sketch image. * It must have a size of at least 24 bytes. * @return an UpdateSketch + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. 
*/ public static UpdateSketch heapify(final MemorySegment srcSeg) { return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); @@ -130,6 +139,9 @@ public static UpdateSketch heapify(final MemorySegment srcSeg) { * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. * @return an UpdateSketch + * @throws SketchesArgumentException if the provided MemorySegment + * is invalid, corrupted, or incompatible with this sketch type. + * Callers must treat this as a fatal error for that segment. */ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null"); @@ -163,7 +175,7 @@ int getCurrentDataLongs() { @Override public boolean hasMemorySegment() { - return this instanceof DirectQuickSelectSketchR && ((DirectQuickSelectSketchR)this).hasMemorySegment(); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.hasMemorySegment(); } @Override @@ -173,7 +185,7 @@ public boolean isCompact() { @Override public boolean isOffHeap() { - return this instanceof DirectQuickSelectSketchR && ((DirectQuickSelectSketchR)this).isOffHeap(); + return this instanceof final DirectQuickSelectSketchR dqssr && dqssr.isOffHeap(); } @Override diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java index 834778f87..6c8d5f37f 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketchBuilder.java @@ -117,6 +117,22 @@ public UpdateSketchBuilder setLogNominalEntries(final int lgNomEntries) { return this; } + /** + * Alternative method of setting the Nominal Entries for this sketch from the log_base2 value, + * commonly called LgK. + * This value is also used for building a shared concurrent sketch. + * The minimum value is 4 and the maximum value is 26. 
+ * Be aware that sketches as large as 26 may not have been + * thoroughly characterized for performance. + * + * @param lgK the Log Nominal Entries. Also for the concurrent shared sketch + * @return this UpdateSketchBuilder + */ + public UpdateSketchBuilder setLgK(final int lgK) { + bLgNomLongs = ThetaUtil.checkNomLongs(1 << lgK); + return this; + } + /** * Returns Log-base 2 Nominal Entries * @return Log-base 2 Nominal Entries diff --git a/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java b/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java index 4012cb412..778dc02f2 100644 --- a/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java +++ b/src/main/java/org/apache/datasketches/thetacommon/ThetaUtil.java @@ -71,8 +71,7 @@ private ThetaUtil() {} * @param lgMin Log2 of the minimum allowed starting size * @return The Log2 of the starting size */ - public static int startingSubMultiple(final int lgTarget, final int lgRF, - final int lgMin) { + public static int startingSubMultiple(final int lgTarget, final int lgRF, final int lgMin) { return lgTarget <= lgMin ? lgMin : lgRF == 0 ? lgTarget : (lgTarget - lgMin) % lgRF + lgMin; } From fa97b7c3ee44ab2da381fe313dbe8e00a62b2542 Mon Sep 17 00:00:00 2001 From: Lee Rhodes
* Long || Start Byte Adr:
* Adr:
- * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
- * 0 || Seed Hash | Flags | LgArr | LgNom | FamID | SerVer | lgRF | PreLongs=3 |
+ * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
+ * 0 || Seed Hash | Flags | LgArr | LgNom | FamID=1 | SerVer=3 | lgRF | PreLongs=3 |
*
- * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 |
- * 1 ||-----------------p-----------------|----------Retained Entries Count---------------|
+ * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 |
+ * 1 ||-----------------p-----------------|----------Retained Entries Count-------------------|
*
- * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 |
- * 2 ||---------------------------------Theta---------------------------------------------|
+ * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 |
+ * 2 ||---------------------------------Theta-------------------------------------------------|
*
*/
diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
index fdd2860ce..50c419e61 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
@@ -57,6 +57,7 @@ final class HeapCompactSketch extends CompactSketch {
* @param curCount correct value
* @param thetaLong The correct
* thetaLong.
+ * @param ordered true if cache is ordered.
*/
HeapCompactSketch(final long[] cache, final boolean empty, final short seedHash,
final int curCount, final long thetaLong, final boolean ordered) {
diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
index 5d8af6bfb..b51273404 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
@@ -30,7 +30,6 @@
import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor;
import static org.apache.datasketches.theta.PreambleUtil.extractP;
-import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncremented;
import static org.apache.datasketches.theta.UpdateReturnState.InsertedCountIncrementedRebuilt;
@@ -108,7 +107,7 @@ private HeapQuickSelectSketch(final int lgNomLongs, final long seed, final float
* @return instance of this sketch
*/
static HeapQuickSelectSketch heapifyInstance(final MemorySegment srcSeg, final long seed) {
- final int preambleLongs = extractPreLongs(srcSeg); //byte 0
+ final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0
final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3
final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4
diff --git a/src/main/java/org/apache/datasketches/theta/Intersection.java b/src/main/java/org/apache/datasketches/theta/Intersection.java
index a31dc3ef9..134c49ff6 100644
--- a/src/main/java/org/apache/datasketches/theta/Intersection.java
+++ b/src/main/java/org/apache/datasketches/theta/Intersection.java
@@ -20,23 +20,13 @@
package org.apache.datasketches.theta;
import static java.lang.foreign.ValueLayout.JAVA_BYTE;
-import static org.apache.datasketches.common.Util.floorPowerOf2;
-import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.SER_VER;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
-import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
-import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
-import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
-import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
-import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
import java.lang.foreign.MemorySegment;
-import java.util.Arrays;
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.common.Util;
-import org.apache.datasketches.thetacommon.ThetaUtil;
/**
* The API for intersection operations
@@ -164,84 +154,4 @@ public static Intersection wrap(final MemorySegment srcSeg, final long expectedS
return IntersectionImpl.wrapInstance(srcSeg, expectedSeed, srcSeg.isReadOnly() );
}
- // Restricted
-
- /**
- * Returns the maximum lgArrLongs given the capacity of the MemorySegment.
- * @param dstSeg the given MemorySegment
- * @return the maximum lgArrLongs given the capacity of the MemorySegment
- */
- protected static int getMaxLgArrLongs(final MemorySegment dstSeg) {
- final int preBytes = CONST_PREAMBLE_LONGS << 3;
- final long cap = dstSeg.byteSize();
- return Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3);
- }
-
- protected static void checkMinSizeMemorySegment(final MemorySegment seg) {
- final int minBytes = (CONST_PREAMBLE_LONGS << 3) + (8 << ThetaUtil.MIN_LG_ARR_LONGS);//280
- final long cap = seg.byteSize();
- if (cap < minBytes) {
- throw new SketchesArgumentException(
- "MemorySegment must be at least " + minBytes + " bytes. Actual capacity: " + cap);
- }
- }
-
- /**
- * Compact first 2^lgArrLongs of given array
- * @param srcCache anything
- * @param lgArrLongs The correct
- * lgArrLongs.
- * @param curCount must be correct
- * @param thetaLong The correct
- * thetaLong.
- * @param dstOrdered true if output array must be sorted
- * @return the compacted array
- */ //Only used in IntersectionImpl & Test
- static final long[] compactCachePart(final long[] srcCache, final int lgArrLongs,
- final int curCount, final long thetaLong, final boolean dstOrdered) {
- if (curCount == 0) {
- return new long[0];
- }
- final long[] cacheOut = new long[curCount];
- final int len = 1 << lgArrLongs;
- int j = 0;
- for (int i = 0; i < len; i++) {
- final long v = srcCache[i];
- if (v <= 0L || v >= thetaLong ) { continue; }
- cacheOut[j++] = v;
- }
- assert curCount == j;
- if (dstOrdered) {
- Arrays.sort(cacheOut);
- }
- return cacheOut;
- }
-
- protected static void segChecks(final MemorySegment srcSeg) {
- //Get Preamble
- //Note: Intersection does not use lgNomLongs (or k), per se.
- //seedHash loaded and checked in private constructor
- final int preLongs = extractPreLongs(srcSeg);
- final int serVer = extractSerVer(srcSeg);
- final int famID = extractFamilyID(srcSeg);
- final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) > 0;
- final int curCount = extractCurCount(srcSeg);
- //Checks
- if (preLongs != CONST_PREAMBLE_LONGS) {
- throw new SketchesArgumentException(
- "MemorySegment PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongs);
- }
- if (serVer != SER_VER) {
- throw new SketchesArgumentException("Serialization Version must equal " + SER_VER);
- }
- Family.INTERSECTION.checkFamilyID(famID);
- if (empty) {
- if (curCount != 0) {
- throw new SketchesArgumentException(
- "srcSeg empty state inconsistent with curCount: " + empty + "," + curCount);
- }
- //empty = true AND curCount_ = 0: OK
- } //else empty = false, curCount could be anything
- }
-
}
diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
index ebb4a6215..6819524b1 100644
--- a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
+++ b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
@@ -26,6 +26,7 @@
import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED;
import static java.lang.foreign.ValueLayout.JAVA_SHORT_UNALIGNED;
import static org.apache.datasketches.common.Util.clearBits;
+import static org.apache.datasketches.common.Util.floorPowerOf2;
import static org.apache.datasketches.common.Util.setBits;
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
@@ -41,8 +42,10 @@
import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG;
import static org.apache.datasketches.theta.PreambleUtil.clearEmpty;
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
+import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs;
+import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import static org.apache.datasketches.theta.PreambleUtil.insertCurCount;
import static org.apache.datasketches.theta.PreambleUtil.insertFamilyID;
@@ -81,17 +84,17 @@
* @author Kevin Lang
*/
final class IntersectionImpl extends Intersection {
- protected final short seedHash_;
- protected final boolean readOnly_; //True if this sketch is to be treated as read only
- protected final MemorySegment wseg_;
- protected final int maxLgArrLongs_; //only used with MemorySegment, not serialized
+ private final short seedHash_;
+ private final boolean readOnly_; //True if this sketch is to be treated as read only
+ private final MemorySegment wseg_;
+ private final int maxLgArrLongs_; //only used with MemorySegment, not serialized
//Note: Intersection does not use lgNomLongs or k, per se.
- protected int lgArrLongs_; //current size of hash table
- protected int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true
- protected long thetaLong_;
- protected boolean empty_; //A virgin intersection represents the Universal Set, so empty is FALSE!
- protected long[] hashTable_; //retained entries of the intersection, on-heap only.
+ private int lgArrLongs_; //current size of hash table
+ private int curCount_; //curCount of HT, if < 0 means Universal Set (US) is true
+ private long thetaLong_;
+ private boolean empty_; //A virgin intersection represents the Universal Set, so empty is FALSE!
+ private long[] hashTable_; //retained entries of the intersection, on-heap only.
/**
* Constructor: Sets the class finals and computes, sets and checks the seedHash.
@@ -100,14 +103,14 @@ final class IntersectionImpl extends Intersection {
* @param dstSegFlag The given MemorySegment is a Destination (new offHeap) MemorySegment.
* @param readOnly True if MemorySegment is to be treated as read only.
*/
- protected IntersectionImpl(final MemorySegment wseg, final long seed, final boolean dstSegFlag,
+ private IntersectionImpl(final MemorySegment wseg, final long seed, final boolean dstSegFlag,
final boolean readOnly) {
readOnly_ = readOnly;
if (wseg != null) {
wseg_ = wseg;
if (dstSegFlag) { //DstSeg: compute & store seedHash, no seedHash checking
- checkMinSizeMemorySegment(wseg);
- maxLgArrLongs_ = !readOnly ? getMaxLgArrLongs(wseg) : 0; //Only Off Heap
+ IntersectionImpl.checkMinSizeMemorySegment(wseg);
+ maxLgArrLongs_ = !readOnly ? IntersectionImpl.getMaxLgArrLongs(wseg) : 0; //Only Off Heap
seedHash_ = Util.computeSeedHash(seed);
wseg_.set(JAVA_SHORT_UNALIGNED, SEED_HASH_SHORT, seedHash_);
} else { //SrcSeg:gets and stores the seedHash, checks seg_seedHash against the seed
@@ -179,8 +182,8 @@ static IntersectionImpl initNewDirectInstance(final long seed, final MemorySegme
static IntersectionImpl heapifyInstance(final MemorySegment srcSeg, final long seed) {
final boolean dstSegFlag = false;
final boolean readOnly = false;
+ IntersectionImpl.segChecks(srcSeg);
final IntersectionImpl impl = new IntersectionImpl(null, seed, dstSegFlag, readOnly);
- segChecks(srcSeg);
//Initialize
impl.lgArrLongs_ = extractLgArrLongs(srcSeg);
@@ -207,8 +210,8 @@ static IntersectionImpl wrapInstance(
final long seed,
final boolean readOnly) {
final boolean dstSegFlag = false;
+ IntersectionImpl.segChecks(srcSeg);
final IntersectionImpl impl = new IntersectionImpl(srcSeg, seed, dstSegFlag, readOnly);
- segChecks(srcSeg);
impl.lgArrLongs_ = extractLgArrLongs(srcSeg);
impl.curCount_ = extractCurCount(srcSeg);
impl.thetaLong_ = extractThetaLong(srcSeg);
@@ -333,7 +336,7 @@ public CompactSketch getResult(final boolean dstOrdered, final MemorySegment dst
} else {
hashTable = hashTable_;
}
- compactCache = compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered);
+ compactCache = IntersectionImpl.compactCachePart(hashTable, lgArrLongs_, curCount_, thetaLong_, dstOrdered);
srcCompact = true;
srcOrdered = dstOrdered;
return CompactOperations.componentsToCompact(
@@ -561,4 +564,83 @@ private void resetCommon() {
thetaLong_ = Long.MAX_VALUE;
hashTable_ = null;
}
+
+ /**
+  * Collects the valid hash values found in the first 2^lgArrLongs slots of the given array.
+  * A value is kept only when it is greater than zero and less than thetaLong.
+  * @param srcCache the array to scan; it is indexed from 0 up to 2^lgArrLongs - 1
+  * @param lgArrLongs log_base2 of the number of slots to scan
+  * @param curCount the number of values expected to be kept; it sizes the returned array
+  * @param thetaLong the exclusive upper bound for a kept value
+  * @param dstOrdered true if output array must be sorted
+  * @return a new array of length curCount holding the kept values,
+  * or an empty array (without scanning) when curCount is zero
+  */ //used in Test
+ static final long[] compactCachePart(final long[] srcCache, final int lgArrLongs,
+     final int curCount, final long thetaLong, final boolean dstOrdered) {
+   if (curCount == 0) { return new long[0]; }
+   final long[] compacted = new long[curCount];
+   final int slots = 1 << lgArrLongs;
+   int kept = 0;
+   for (int slot = 0; slot < slots; slot++) {
+     final long hash = srcCache[slot];
+     //keep only values inside the open interval (0, thetaLong)
+     if (hash > 0L && hash < thetaLong) {
+       compacted[kept++] = hash;
+     }
+   }
+   assert curCount == kept;
+   if (dstOrdered) {
+     Arrays.sort(compacted);
+   }
+   return compacted;
+ }
+
+ //Rejects a MemorySegment that cannot hold the preamble plus a hash table of 2^MIN_LG_ARR_LONGS longs.
+ private static void checkMinSizeMemorySegment(final MemorySegment seg) {
+   final int requiredBytes = (8 << ThetaUtil.MIN_LG_ARR_LONGS) + (CONST_PREAMBLE_LONGS << 3);//280
+   final long actualBytes = seg.byteSize();
+   if (actualBytes >= requiredBytes) { return; }
+   throw new SketchesArgumentException(
+       "MemorySegment must be at least " + requiredBytes + " bytes. Actual capacity: " + actualBytes);
+ }
+
+ /**
+  * Returns the largest lgArrLongs such that 2^lgArrLongs longs fit in the bytes of dstSeg after the preamble.
+  * @param dstSeg the given MemorySegment; the constructor checks its minimum size before calling this
+  * @return the maximum lgArrLongs given the capacity of the MemorySegment
+  */
+ private static int getMaxLgArrLongs(final MemorySegment dstSeg) {
+   //clamp before narrowing: for a segment larger than 2GB a bare (int) cast of the long size would wrap
+   final long tableBytes = Math.min(dstSeg.byteSize() - (CONST_PREAMBLE_LONGS << 3), Integer.MAX_VALUE);
+   return Integer.numberOfTrailingZeros(floorPowerOf2((int) tableBytes) >>> 3);
+ }
+
+ //Checks preLongs, serVer, family ID and the empty-flag/curCount consistency of an Intersection image.
+ private static void segChecks(final MemorySegment srcSeg) {
+   //Get Preamble
+   //Note: Intersection does not use lgNomLongs (or k), per se.
+   //seedHash loaded and checked in private constructor
+   final int preLongs = Sketch.getPreambleLongs(srcSeg);
+   final int serVer = extractSerVer(srcSeg);
+   final int famID = extractFamilyID(srcSeg);
+   final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) > 0;
+   final int curCount = extractCurCount(srcSeg);
+   //Checks
+   if (preLongs != CONST_PREAMBLE_LONGS) {
+     throw new SketchesArgumentException(
+         "MemorySegment PreambleLongs must equal " + CONST_PREAMBLE_LONGS + ": " + preLongs);
+   }
+   if (serVer != SER_VER) { //report the value actually found, as the preLongs check does
+     throw new SketchesArgumentException(
+         "Serialization Version must equal " + SER_VER + ": " + serVer);
+   }
+   Family.INTERSECTION.checkFamilyID(famID);
+   if (empty && curCount != 0) {
+     throw new SketchesArgumentException(
+         "srcSeg empty state inconsistent with curCount: " + empty + "," + curCount);
+   }
+   //empty with curCount == 0 is consistent; when not empty, curCount could be anything
+ }
+
}
diff --git a/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java b/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java
index 548c79ef3..53344c8d6 100644
--- a/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java
+++ b/src/main/java/org/apache/datasketches/theta/MemorySegmentHashIterator.java
@@ -38,7 +38,7 @@ final class MemorySegmentHashIterator implements HashIterator {
this.seg = srcSeg;
this.arrLongs = arrLongs;
this.thetaLong = thetaLong;
- offsetBytes = PreambleUtil.extractPreLongs(srcSeg) << 3;
+ offsetBytes = Sketch.getPreambleLongs(srcSeg) << 3;
index = -1;
hash = 0;
}
diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
index 294682e04..19dec2061 100644
--- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
+++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
@@ -233,7 +233,7 @@ private PreambleUtil() {}
* @param preambleLongs current preamble size
* @return the size in bytes
*/
- static int getSegBytes(final int lgArrLongs, final int preambleLongs) {
+ static int getUpdatableSegBytes(final int lgArrLongs, final int preambleLongs) {
return (8 << lgArrLongs) + (preambleLongs << 3);
}
diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
index abf8df391..062c6d86d 100644
--- a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
@@ -26,7 +26,6 @@
import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
-import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
@@ -385,7 +384,7 @@ short getSeedHash() {
}
static boolean otherCheckForSingleItem(final MemorySegment seg) {
- return otherCheckForSingleItem(extractPreLongs(seg), extractSerVer(seg),
+ return otherCheckForSingleItem(Sketch.getPreambleLongs(seg), extractSerVer(seg),
extractFamilyID(seg), extractFlags(seg) );
}
diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java
index 8ff9c13d0..bc944478d 100644
--- a/src/main/java/org/apache/datasketches/theta/Sketch.java
+++ b/src/main/java/org/apache/datasketches/theta/Sketch.java
@@ -20,16 +20,24 @@
package org.apache.datasketches.theta;
import static java.lang.foreign.ValueLayout.JAVA_BYTE;
+import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED;
+import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED;
import static org.apache.datasketches.common.Family.idToFamily;
import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE;
import static org.apache.datasketches.common.Util.LS;
import static org.apache.datasketches.common.Util.ceilingPowerOf2;
import static org.apache.datasketches.common.Util.zeroPad;
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
+import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
-import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
+import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
+import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
+import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG;
+import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
+import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
+import static org.apache.datasketches.theta.PreambleUtil.getAndCheckPreLongs;
import static org.apache.datasketches.thetacommon.HashOperations.count;
import java.lang.foreign.MemorySegment;
@@ -70,12 +78,12 @@ public abstract class Sketch implements MemorySegmentStatus {
* @return a Sketch on the heap.
*/
public static Sketch heapify(final MemorySegment srcSeg) {
- final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE);
- final Family family = idToFamily(familyID);
- if (family == Family.COMPACT) {
- return CompactSketch.heapify(srcSeg);
+// return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED);
+ final int familyID = extractFamilyID(srcSeg);
+ if (familyID == Family.COMPACT.getID()) {
+ return CompactSketch.heapify(srcSeg);//, Util.DEFAULT_UPDATE_SEED);
}
- return heapifyUpdateFromMemorySegment(srcSeg, Util.DEFAULT_UPDATE_SEED);
+ return heapifyUpdateSketchFromMemorySegment(srcSeg, Util.DEFAULT_UPDATE_SEED);
}
/**
@@ -83,13 +91,14 @@ public static Sketch heapify(final MemorySegment srcSeg) {
*
* The resulting sketch will not retain any link to the source MemorySegment.
* - *For Update Sketches this method checks if the - * Default Update Seed
- * was used to create the source MemorySegment image. + *For Update Sketches this method checks if the expectedSeed + * was used to create the source MemorySegment image.
* *For Compact Sketches this method assumes that the sketch image was created with the * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.
* + *Note: This assumes only SerVer 3 and later.
+ * * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. @@ -102,7 +111,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed if (family == Family.COMPACT) { return CompactSketch.heapify(srcSeg, expectedSeed); } - return heapifyUpdateFromMemorySegment(srcSeg, expectedSeed); + return heapifyUpdateSketchFromMemorySegment(srcSeg, expectedSeed); } /** @@ -275,6 +284,20 @@ public int getCountLessThanThetaLong(final long thetaLong) { */ public abstract double getEstimate(); + /** + * Gets the estimate from the given MemorySegment + * @param srcSeg the given MemorySegment + * @return the result estimate + */ + public static double getEstimate(final MemorySegment srcSeg) { + final int famId = extractFamilyID(srcSeg); + if (!isValidSketchID(famId)) { + throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. Family: " + + Family.idToFamily(famId).toString()); + } + return Sketch.estimate(extractThetaLong(srcSeg), getRetainedEntries(srcSeg)); + } + /** * Returns the Family that this sketch belongs to * @return the Family that this sketch belongs to @@ -351,6 +374,30 @@ public int getRetainedEntries() { return getRetainedEntries(true); } + /** + * Returns the number of valid entries that have been retained by the sketch from the given MemorySegment + * @param srcSeg the given MemorySegment that has an image of a Sketch + * @return the number of valid retained entries + */ + public static int getRetainedEntries(final MemorySegment srcSeg) { + final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); + if (serVer == 1) { + final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); + if (Sketch.getThetaLong(srcSeg) == Long.MAX_VALUE && entries == 0) { + return 0; + } + return entries; + } + //SerVer 2 or 3 + final int preLongs = Sketch.getPreambleLongs(srcSeg); + final boolean empty 
= (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 + if (preLongs == 1) { + return empty ? 0 : 1; + } + //preLongs > 1 + return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); //for SerVer 1,2,3 + } + /** * Returns the number of entries that have been retained by the sketch. * @param valid if true, returns the number of valid entries, which are less than theta and used @@ -416,7 +463,7 @@ public double getUpperBound(final int numStdDev) { * @return true if the sketch is in estimation mode. */ public boolean isEstimationMode() { - return estMode(getThetaLong(), isEmpty()); + return getThetaLong() < Long.MAX_VALUE && !isEmpty(); } /** @@ -606,6 +653,23 @@ public static String toString(final MemorySegment seg) { */ abstract short getSeedHash(); + static boolean getEmpty(final MemorySegment srcSeg) { + final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); + if (serVer == 1) { + return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; + } + return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2,3,4 + } + + static int getPreambleLongs(final MemorySegment srcSeg) { + return getAndCheckPreLongs(srcSeg); //for SerVer 1,2,3,4 + } + + static long getThetaLong(final MemorySegment srcSeg) { + final int preLongs = Sketch.getPreambleLongs(srcSeg); + return preLongs < 3 ? 
Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3,4 + } + /** * Returns true if given Family id is one of the theta sketches * @param id the given Family id @@ -617,44 +681,49 @@ static final boolean isValidSketchID(final int id) { || id == Family.COMPACT.getID(); } - /** - * Checks Ordered and Compact flags for integrity between sketch and a MemorySegment - * @param sketch the given sketch - */ - static final void checkSketchAndMemorySegmentFlags(final Sketch sketch) { - final MemorySegment seg = sketch.getMemorySegment(); - if (seg == null) { return; } - final int flags = PreambleUtil.extractFlags(seg); - if ((flags & COMPACT_FLAG_MASK) > 0 ^ sketch.isCompact()) { - throw new SketchesArgumentException("Possible corruption: " - + "MemorySegment Compact Flag inconsistent with Sketch"); - } - if ((flags & ORDERED_FLAG_MASK) > 0 ^ sketch.isOrdered()) { - throw new SketchesArgumentException("Possible corruption: " - + "MemorySegment Ordered Flag inconsistent with Sketch"); - } - } - static final double estimate(final long thetaLong, final int curCount) { return curCount * (LONG_MAX_VALUE_AS_DOUBLE / thetaLong); } - static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev, - final boolean empty) { + /** + * Gets the approximate lower error bound from a valid MemorySegment image of a Sketch + * given the specified number of Standard Deviations. + * This will return getEstimate() if isEmpty() is true. + * + * @param numStdDev + * See Number of Standard Deviations + * @param srcSeg the source MemorySegment + * @return the lower bound. 
+ */ + public static double getLowerBound(final int numStdDev, final MemorySegment srcSeg) { + return lowerBound(getRetainedEntries(srcSeg), Sketch.getThetaLong(srcSeg), numStdDev, Sketch.getEmpty(srcSeg)); + } + + static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev, final boolean empty) { final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; return BinomialBoundsN.getLowerBound(curCount, theta, numStdDev, empty); } + /** + * Gets the approximate upper error bound from a valid MemorySegment image of a Sketch + * given the specified number of Standard Deviations. + * This will return getEstimate() if isEmpty() is true. + * + * @param numStdDev + * See Number of Standard Deviations + * @param srcSeg the source MemorySegment + * @return the upper bound. + */ + public static double getUpperBound(final int numStdDev, final MemorySegment srcSeg) { + return upperBound(getRetainedEntries(srcSeg), Sketch.getThetaLong(srcSeg), numStdDev, Sketch.getEmpty(srcSeg)); + } + static final double upperBound(final int curCount, final long thetaLong, final int numStdDev, final boolean empty) { final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE; return BinomialBoundsN.getUpperBound(curCount, theta, numStdDev, empty); } - private static final boolean estMode(final long thetaLong, final boolean empty) { - return thetaLong < Long.MAX_VALUE && !empty; - } - /** * Instantiates a Heap Update Sketch from MemorySegment. Only SerVer3. SerVer 1 & 2 already handled. * @param srcSeg the source MemorySegment @@ -662,7 +731,7 @@ private static final boolean estMode(final long thetaLong, final boolean empty) * See Update Hash Seed. 
* @return a Sketch */ - private static final Sketch heapifyUpdateFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { + private static final Sketch heapifyUpdateSketchFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) { final long cap = srcSeg.byteSize(); if (cap < 8) { throw new SketchesArgumentException( diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java deleted file mode 100644 index 69d945c40..000000000 --- a/src/main/java/org/apache/datasketches/theta/Sketches.java +++ /dev/null @@ -1,406 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.datasketches.theta; - -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; -import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; -import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; - -import java.lang.foreign.MemorySegment; - -import org.apache.datasketches.common.Family; -import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.common.Util; - -/** - * This class brings together the common sketch and set operation creation methods and - * the public static methods into one place. - * - * @author Lee Rhodes - */ -public final class Sketches { - - private Sketches() {} - - /** - * Gets the unique count estimate from a valid MemorySegment image of a Sketch - * @param srcSeg the source MemorySegment - * @return the sketch's best estimate of the cardinality of the input stream. - */ - public static double getEstimate(final MemorySegment srcSeg) { - checkIfValidThetaSketch(srcSeg); - return Sketch.estimate(getThetaLong(srcSeg), getRetainedEntries(srcSeg)); - } - - /** - * Gets the approximate lower error bound from a valid MemorySegment image of a Sketch - * given the specified number of Standard Deviations. - * This will return getEstimate() if isEmpty() is true. - * - * @param numStdDev - * See Number of Standard Deviations - * @param srcSeg the source MemorySegment - * @return the lower bound. 
- */ - public static double getLowerBound(final int numStdDev, final MemorySegment srcSeg) { - return Sketch.lowerBound(getRetainedEntries(srcSeg), getThetaLong(srcSeg), numStdDev, getEmpty(srcSeg)); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxAnotBResultBytes(int)}. - * Returns the maximum number of bytes for the returned CompactSketch, given the maximum - * value of nomEntries of the first sketch A of AnotB. - * @param maxNomEntries the given value - * @return the maximum number of bytes. - */ - public static int getMaxAnotBResultBytes(final int maxNomEntries) { - return SetOperation.getMaxAnotBResultBytes(maxNomEntries); - } - - /** - * Returns the maximum number of storage bytes required for a CompactSketch with the given - * number of actual entries. - * @param numberOfEntries the actual number of retained entries stored in the sketch. - * @return the maximum number of storage bytes required for a CompactSketch with the given number - * of retained entries. - */ - public static int getMaxCompactSketchBytes(final int numberOfEntries) { - return Sketch.getMaxCompactSketchBytes(numberOfEntries); - } - - /** - * Returns the maximum number of storage bytes required for a CompactSketch given the configured - * log_base2 of the number of nominal entries, which is a power of 2. - * @param lgNomEntries Nominal Entries - * @return the maximum number of storage bytes required for a CompactSketch with the given - * lgNomEntries. 
- * @see Sketch#getCompactSketchMaxBytes(int) - */ - public static int getCompactSketchMaxBytes(final int lgNomEntries) { - return Sketch.getCompactSketchMaxBytes(lgNomEntries); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxIntersectionBytes(int)} - * @param nomEntries Ref: {@link SetOperation#getMaxIntersectionBytes(int)}, {@code nomEntries} - * @return Ref: {@link SetOperation#getMaxIntersectionBytes(int)} - */ - public static int getMaxIntersectionBytes(final int nomEntries) { - return SetOperation.getMaxIntersectionBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link SetOperation#getMaxUnionBytes(int)} - * @param nomEntries Ref: {@link SetOperation#getMaxUnionBytes(int)}, {@code nomEntries} - * @return Ref: {@link SetOperation#getMaxUnionBytes(int)} - */ - public static int getMaxUnionBytes(final int nomEntries) { - return SetOperation.getMaxUnionBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link Sketch#getMaxUpdateSketchBytes(int)} - * @param nomEntries Ref: {@link Sketch#getMaxUpdateSketchBytes(int)}, {@code nomEntries} - * @return Ref: {@link Sketch#getMaxUpdateSketchBytes(int)} - */ - public static int getMaxUpdateSketchBytes(final int nomEntries) { - return Sketch.getMaxUpdateSketchBytes(nomEntries); - } - - /** - * Convenience method, ref: {@link Sketch#getSerializationVersion(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#getSerializationVersion(MemorySegment)}, {@code srcSeg} - * @return Ref: {@link Sketch#getSerializationVersion(MemorySegment)} - */ - public static int getSerializationVersion(final MemorySegment srcSeg) { - return Sketch.getSerializationVersion(srcSeg); - } - - /** - * Gets the approximate upper error bound from a valid MemorySegment image of a Sketch - * given the specified number of Standard Deviations. - * This will return getEstimate() if isEmpty() is true. 
- * - * @param numStdDev - * See Number of Standard Deviations - * @param srcSeg the source MemorySegment - * @return the upper bound. - */ - public static double getUpperBound(final int numStdDev, final MemorySegment srcSeg) { - return Sketch.upperBound(getRetainedEntries(srcSeg), getThetaLong(srcSeg), numStdDev, getEmpty(srcSeg)); - } - - //Heapify Operations - - /** - * Convenience method, ref: {@link CompactSketch#heapify(MemorySegment) CompactSketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link CompactSketch#heapify(MemorySegment) CompactSketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch heapifyCompactSketch(final MemorySegment srcSeg) { - return CompactSketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed Ref: {@link CompactSketch#heapify(MemorySegment, long) CompactSketch.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch heapifyCompactSketch(final MemorySegment srcSeg, final long expectedSeed) { - return CompactSketch.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link CompactSketch#wrap(MemorySegment) CompactSketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link CompactSketch#wrap(MemorySegment) CompactSketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch wrapCompactSketch(final MemorySegment srcSeg) { - return CompactSketch.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link 
CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed Ref: {@link CompactSketch#wrap(MemorySegment, long) CompactSketch.wrap(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link CompactSketch CompactSketch} - */ - public static CompactSketch wrapCompactSketch(final MemorySegment srcSeg, final long expectedSeed) { - return CompactSketch.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link SetOperation#heapify(MemorySegment) SetOperation.heapify(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#heapify(MemorySegment) SetOperation.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation heapifySetOperation(final MemorySegment srcSeg) { - return SetOperation.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)}, - * {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link SetOperation#heapify(MemorySegment, long) SetOperation.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation heapifySetOperation(final MemorySegment srcSeg, final long expectedSeed) { - return SetOperation.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link Sketch#heapify(MemorySegment) Sketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#heapify(MemorySegment) Sketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link Sketch Sketch} - */ - public static Sketch heapifySketch(final MemorySegment srcSeg) { - return Sketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link Sketch#heapify(MemorySegment, long) Sketch.heapify(MemorySegment, long)}, {@code expectedSeed} - * @return {@link Sketch Sketch} - */ - public static Sketch heapifySketch(final MemorySegment srcSeg, final long expectedSeed) { - return Sketch.heapify(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link UpdateSketch#heapify(MemorySegment) UpdateSketch.heapify(MemorySegment)} - * @param srcSeg Ref: {@link UpdateSketch#heapify(MemorySegment) UpdateSketch.heapify(MemorySegment)}, {@code srcSeg} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch heapifyUpdateSketch(final MemorySegment srcSeg) { - return UpdateSketch.heapify(srcSeg); - } - - /** - * Convenience method, ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)} - * @param srcSeg Ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)}, - * {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link UpdateSketch#heapify(MemorySegment, long) UpdateSketch.heapify(MemorySegment, long)}, - * {@code expectedSeed} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch heapifyUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.heapify(srcSeg, expectedSeed); - } - - //Builders - - /** - * Ref: {@link SetOperationBuilder SetOperationBuilder} - * @return {@link SetOperationBuilder SetOperationBuilder} - */ - public static SetOperationBuilder setOperationBuilder() { - return new SetOperationBuilder(); - } - - /** - * Ref: {@link UpdateSketchBuilder UpdateSketchBuilder} - * @return {@link UpdateSketchBuilder UpdateSketchBuilder} - */ - public static UpdateSketchBuilder updateSketchBuilder() { - return new UpdateSketchBuilder(); - } - - //Wrap operations - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment)}, {@code srcSeg} - * @return a Intersection backed by the given MemorySegment - */ - public static Intersection wrapIntersection(final MemorySegment srcSeg) { - return (Intersection) SetOperation.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment) SetOperation.wrap(MemorySegment)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment) SetOperation.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation wrapSetOperation(final MemorySegment srcSeg) { - return wrapSetOperation(srcSeg, Util.DEFAULT_UPDATE_SEED); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. 
- * Ref: {@link SetOperation#wrap(MemorySegment, long) SetOperation.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link SetOperation SetOperation} - */ - public static SetOperation wrapSetOperation(final MemorySegment srcSeg, final long expectedSeed) { - return SetOperation.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link Sketch#wrap(MemorySegment) Sketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link Sketch#wrap(MemorySegment) Sketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link Sketch Sketch} - */ - public static Sketch wrapSketch(final MemorySegment srcSeg) { - return Sketch.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the expectedSeed used to validate the given MemorySegment image. - * Ref: {@link Sketch#wrap(MemorySegment, long) Sketch.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link Sketch Sketch} - */ - public static Sketch wrapSketch(final MemorySegment srcSeg, final long expectedSeed) { - return Sketch.wrap(srcSeg, expectedSeed); - } - - /** - * Convenience method, ref: {@link SetOperation#wrap(MemorySegment)} and casts the result to a Union - * @param srcSeg Ref: {@link SetOperation#wrap(MemorySegment)}, {@code srcSeg} - * @return a Union backed by the given MemorySegment. 
- */ - public static Union wrapUnion(final MemorySegment srcSeg) { - return (Union) SetOperation.wrap(srcSeg); - } - - /** - * Convenience method, ref: {@link UpdateSketch#wrap(MemorySegment) UpdateSketch.wrap(MemorySegment)} - * @param srcSeg Ref: {@link UpdateSketch#wrap(MemorySegment) UpdateSketch.wrap(MemorySegment)}, {@code srcSeg} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg) { - return wrapUpdateSketch(srcSeg, Util.DEFAULT_UPDATE_SEED); - } - - /** - * Convenience method, ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)} - * @param srcSeg Ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)}, {@code srcSeg} - * @param expectedSeed the seed used to validate the given MemorySegment image. - * Ref: {@link UpdateSketch#wrap(MemorySegment, long) UpdateSketch.wrap(MemorySegment, long)}, {@code expectedSeed} - * @return {@link UpdateSketch UpdateSketch} - */ - public static UpdateSketch wrapUpdateSketch(final MemorySegment srcSeg, final long expectedSeed) { - return UpdateSketch.wrap(srcSeg, null, expectedSeed); - } - - //Restricted static methods - - private static void checkIfValidThetaSketch(final MemorySegment srcSeg) { - final int fam = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - if (!Sketch.isValidSketchID(fam)) { - throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. 
Family: " - + Family.idToFamily(fam).toString()); - } - } - - static boolean getEmpty(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; - } - return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 - } - - static int getPreambleLongs(final MemorySegment srcSeg) { - return srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //for SerVer 1,2,3 - } - - static int getRetainedEntries(final MemorySegment srcSeg) { - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE); - if (serVer == 1) { - final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); - if (getThetaLong(srcSeg) == Long.MAX_VALUE && entries == 0) { - return 0; - } - return entries; - } - //SerVer 2 or 3 - final int preLongs = getPreambleLongs(srcSeg); - final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 - if (preLongs == 1) { - return empty ? 0 : 1; - } - //preLongs > 1 - return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); //for SerVer 1,2,3 - } - - static long getThetaLong(final MemorySegment srcSeg) { - final int preLongs = getPreambleLongs(srcSeg); - return preLongs < 3 ? 
Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3 - } -} diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java index d921ec1ba..bbefd958c 100644 --- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java +++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java @@ -22,6 +22,8 @@ import static java.lang.Math.min; import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static org.apache.datasketches.common.QuickSelect.selectExcludingZeros; +import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; +import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.UNION_THETA_LONG; import static org.apache.datasketches.theta.PreambleUtil.clearEmpty; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; @@ -34,6 +36,7 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.ResizeFactor; +import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; import org.apache.datasketches.thetacommon.HashOperations; @@ -279,7 +282,7 @@ public void union(final Sketch sketchIn) { gadget_.hashUpdate(sketchIn.getCache()[0]); return; } - Sketch.checkSketchAndMemorySegmentFlags(sketchIn); + UnionImpl.checkSketchAndMemorySegmentFlags(sketchIn); unionThetaLong_ = min(min(unionThetaLong_, sketchIn.getThetaLong()), gadget_.getThetaLong()); //Theta rule unionEmpty_ = false; @@ -372,4 +375,22 @@ boolean isEmpty() { return gadget_.isEmpty() && unionEmpty_; } + /** + * Checks Ordered and Compact flags for integrity between sketch and its MemorySegment + * @param sketch the given sketch + */ + private static final void checkSketchAndMemorySegmentFlags(final Sketch sketch) { + final MemorySegment seg = sketch.getMemorySegment(); + if (seg == null) { return; } + final int flags = 
PreambleUtil.extractFlags(seg); + if ((flags & COMPACT_FLAG_MASK) > 0 ^ sketch.isCompact()) { + throw new SketchesArgumentException("Possible corruption: " + + "MemorySegment Compact Flag inconsistent with Sketch"); + } + if ((flags & ORDERED_FLAG_MASK) > 0 ^ sketch.isOrdered()) { + throw new SketchesArgumentException("Possible corruption: " + + "MemorySegment Ordered Flag inconsistent with Sketch"); + } + } + } diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java index 8fe93e46c..4cd3a4cd4 100644 --- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java +++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java @@ -39,7 +39,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractP; import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.PreambleUtil.getSegBytes; +import static org.apache.datasketches.theta.PreambleUtil.getUpdatableSegBytes; import static org.apache.datasketches.theta.UpdateReturnState.RejectedNullOrEmpty; import java.lang.foreign.MemorySegment; @@ -162,8 +162,16 @@ public static UpdateSketch heapify(final MemorySegment srcSeg, final long expect @Override public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstWSeg) { - return componentsToCompact(getThetaLong(), getRetainedEntries(true), getSeedHash(), isEmpty(), - false, false, dstOrdered, dstWSeg, getCache()); + return componentsToCompact( + getThetaLong(), + getRetainedEntries(true), + getSeedHash(), + isEmpty(), + false, //is src compact + false, //is src ordered + dstOrdered, + dstWSeg, + getCache()); } @Override @@ -463,7 +471,7 @@ static void checkSegIntegrity(final MemorySegment srcSeg, final long expectedSee //Check seg capacity, lgArrLongs final long curCapBytes = srcSeg.byteSize(); - final int minReqBytes = 
getSegBytes(lgArrLongs, preambleLongs); + final int minReqBytes = getUpdatableSegBytes(lgArrLongs, preambleLongs); if (curCapBytes < minReqBytes) { throw new SketchesArgumentException( "Possible corruption: Current MemorySegment size < min required size: " diff --git a/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java b/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java index e400dd1fa..6482712e8 100644 --- a/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java +++ b/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java @@ -25,16 +25,10 @@ import static org.testng.Assert.fail; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.AnotB; -import org.apache.datasketches.theta.AnotBimpl; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; import org.testng.annotations.Test; /** @@ -295,19 +289,18 @@ public void checkAnotBnotC_sameMemorySegment() { @Test public void checkAnotBsimple() { - final UpdateSketch skA = Sketches.updateSketchBuilder().build(); - final UpdateSketch skB = Sketches.updateSketchBuilder().build(); - final AnotB aNotB = Sketches.setOperationBuilder().buildANotB(); + final UpdateSketch skA = UpdateSketch.builder().build(); + final UpdateSketch skB =UpdateSketch.builder().build(); + final AnotB aNotB = SetOperation.builder().buildANotB(); final CompactSketch csk = aNotB.aNotB(skA, skB); assertEquals(csk.getCurrentBytes(), 8); } @Test public void checkGetResult() { - final UpdateSketch skA = Sketches.updateSketchBuilder().build(); - final UpdateSketch skB = Sketches.updateSketchBuilder().build(); - - final AnotB aNotB = 
Sketches.setOperationBuilder().buildANotB(); + final UpdateSketch skA = UpdateSketch.builder().build(); + final UpdateSketch skB = UpdateSketch.builder().build(); + final AnotB aNotB = SetOperation.builder().buildANotB(); final CompactSketch csk = aNotB.aNotB(skA, skB); assertEquals(csk.getCurrentBytes(), 8); } @@ -321,7 +314,7 @@ public void checkGetFamily() { @Test public void checkGetMaxBytes() { - final int bytes = Sketches.getMaxAnotBResultBytes(10); + final int bytes = SetOperation.getMaxAnotBResultBytes(10); assertEquals(bytes, 16 * 15 + 24); } diff --git a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java index 8ef889be4..fc35891b3 100644 --- a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java @@ -26,25 +26,13 @@ import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; +import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.DirectCompactSketch; -import org.apache.datasketches.theta.EmptyCompactSketch; -import org.apache.datasketches.theta.HashIterator; -import org.apache.datasketches.theta.HeapCompactSketch; -import org.apache.datasketches.theta.Intersection; -import org.apache.datasketches.theta.SingleItemSketch; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.WrappedCompactCompressedSketch; -import org.apache.datasketches.theta.WrappedCompactSketch; import org.testng.annotations.Test; -import java.lang.foreign.Arena; - /** * @author Lee Rhodes */ @@ -186,7 +174,7 @@ private static void 
checkOtherCompactSketch(final Sketch testSk, final Sketch re @Test public void checkDirectSingleItemSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final int bytes = sk.getCompactBytes(); final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); @@ -230,7 +218,7 @@ public void checkSegTooSmallOrdered() { @Test public void checkCompactCachePart() { //phony values except for curCount = 0. - final long[] result = Intersection.compactCachePart(null, 4, 0, 0L, false); + final long[] result = IntersectionImpl.compactCachePart(null, 4, 0, 0L, false); assertEquals(result.length, 0); } @@ -250,7 +238,7 @@ public void checkCompactCachePart() { * Empty, segment-based Compact sketches are always ordered */ public void checkEmptyMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final MemorySegment wseg1 = MemorySegment.ofArray(new byte[16]); final CompactSketch csk1 = sk.compact(false, wseg1); //the first parameter is ignored when empty @@ -290,7 +278,7 @@ public void checkEmptyMemorySegmentCompactSketch() { * Single-Item, segment-based Compact sketches are always ordered: */ public void checkSingleItemMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final MemorySegment wseg1 = MemorySegment.ofArray(new byte[16]); @@ -321,7 +309,7 @@ public void checkSingleItemMemorySegmentCompactSketch() { @Test public void checkMultipleItemMemorySegmentCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); //This sequence is naturally out-of-order by the hash values. 
sk.update(1); sk.update(2); @@ -360,7 +348,7 @@ public void checkMultipleItemMemorySegmentCompactSketch() { * All empty, heap-based, compact sketches point to the same static, final constant of 8 bytes. */ public void checkEmptyHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final CompactSketch csk1 = sk.compact(false, null); //the first parameter is ignored when empty final State state1 = new State("EmptyCompactSketch", 0, 8, COMPACT, EMPTY, !DIRECT, !SEGMENT, ORDERED, !ESTIMATION); @@ -390,7 +378,7 @@ public void checkEmptyHeapCompactSketch() { * Single-Item, heap-based Compact sketches are always ordered. */ public void checkSingleItemHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); final CompactSketch csk1 = sk.compact(false, null); //the first parameter is ignored when single item @@ -418,7 +406,7 @@ public void checkSingleItemHeapCompactSketch() { @Test public void checkMultipleItemHeapCompactSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); //This sequence is naturally out-of-order by the hash values. 
sk.update(1); sk.update(2); @@ -453,9 +441,9 @@ public void checkMultipleItemHeapCompactSketch() { @Test public void checkHeapifySingleItemSketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); - final int bytes = Sketches.getMaxCompactSketchBytes(2); //1 more than needed + final int bytes = Sketch.getMaxCompactSketchBytes(2); //1 more than needed final MemorySegment wseg = MemorySegment.ofArray(new byte[bytes]); sk.compact(false, wseg); final Sketch csk = Sketch.heapify(wseg); @@ -464,7 +452,7 @@ public void checkHeapifySingleItemSketch() { @Test public void checkHeapifyEmptySketch() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final MemorySegment wseg = MemorySegment.ofArray(new byte[16]); //empty, but extra bytes final CompactSketch csk = sk.compact(false, wseg); //ignores order because it is empty assertTrue(csk instanceof DirectCompactSketch); @@ -474,7 +462,7 @@ public void checkHeapifyEmptySketch() { @Test public void checkGetCache() { - final UpdateSketch sk = Sketches.updateSketchBuilder().setP((float).5).build(); + final UpdateSketch sk = UpdateSketch.builder().setP((float).5).build(); sk.update(7); final int bytes = sk.getCompactBytes(); final CompactSketch csk = sk.compact(true, MemorySegment.ofArray(new byte[bytes])); @@ -484,7 +472,7 @@ public void checkGetCache() { @Test public void checkHeapCompactSketchCompact() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); sk.update(1); sk.update(2); final CompactSketch csk = sk.compact(); @@ -506,7 +494,7 @@ public void checkDirectCompactSketchCompact() { final int lgK = 6; //empty - final UpdateSketch sk = Sketches.updateSketchBuilder().setLogNominalEntries(lgK).build(); + final UpdateSketch sk = UpdateSketch.builder().setLogNominalEntries(lgK).build(); bytes = 
sk.getCompactBytes(); //empty, 8 bytes wseg1 = MemorySegment.ofArray(new byte[bytes]); wseg2 = MemorySegment.ofArray(new byte[bytes]); @@ -566,7 +554,7 @@ public void checkDirectCompactSketchCompact() { @Test public void serializeDeserializeHeapV4() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -583,7 +571,7 @@ public void serializeDeserializeHeapV4() { @Test public void serializeDeserializeDirectV4_segment() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -600,7 +588,7 @@ public void serializeDeserializeDirectV4_segment() { @Test public void serializeDeserializeDirectV4_bytes() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -618,7 +606,7 @@ public void serializeDeserializeDirectV4_bytes() { @Test public void serializeWrapBytesV3() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } @@ -636,7 +624,7 @@ public void serializeWrapBytesV3() { @Test public void serializeWrapBytesV4() { - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); for (int i = 0; i < 10000; i++) { sk.update(i); } diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java index 28f78ac18..e0816b0e5 100644 --- a/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/ConcurrentDirectQuickSelectSketchTest.java @@ 
-33,15 +33,6 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.ConcurrentDirectQuickSelectSketch; -import org.apache.datasketches.theta.ConcurrentHeapThetaBuffer; -import org.apache.datasketches.theta.ConcurrentSharedThetaSketch; -import org.apache.datasketches.theta.DirectQuickSelectSketch; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.UpdateSketchBuilder; import org.apache.datasketches.theta.ConcurrentHeapQuickSelectSketchTest.SharedLocal; import org.apache.datasketches.thetacommon.HashOperations; import org.testng.annotations.Test; @@ -79,7 +70,7 @@ public void checkHeapifyMemorySegmentEstimating() { assertEquals(local.getClass().getSimpleName(), "ConcurrentHeapThetaBuffer"); //This sharedHeap is not linked to the concurrent local buffer - final UpdateSketch sharedHeap = Sketches.heapifyUpdateSketch(sl.wseg); + final UpdateSketch sharedHeap = UpdateSketch.heapify(sl.wseg); assertEquals(sharedHeap.getClass().getSimpleName(), "HeapQuickSelectSketch"); checkMemorySegmentDirectProxyMethods(local, shared); @@ -509,7 +500,7 @@ public void checkConstructReconstructFromMemorySegment() { serArr = shared.toByteArray(); final MemorySegment seg = MemorySegment.ofArray(serArr); - final UpdateSketch recoveredShared = Sketches.wrapUpdateSketch(seg); + final UpdateSketch recoveredShared = UpdateSketch.wrap(seg); //reconstruct to Native/Direct final int bytes = Sketch.getMaxUpdateSketchBytes(k); diff --git a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java index 7ba11c1c9..c354fd344 100644 --- 
a/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java +++ b/src/test/java/org/apache/datasketches/theta/ConcurrentHeapQuickSelectSketchTest.java @@ -34,17 +34,6 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.ConcurrentHeapQuickSelectSketch; -import org.apache.datasketches.theta.ConcurrentHeapThetaBuffer; -import org.apache.datasketches.theta.ConcurrentPropagationService; -import org.apache.datasketches.theta.ConcurrentSharedThetaSketch; -import org.apache.datasketches.theta.HeapQuickSelectSketch; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.UpdateSketch; -import org.apache.datasketches.theta.UpdateSketchBuilder; import org.testng.annotations.Test; /** @@ -173,7 +162,7 @@ public void checkHeapifyByteArrayExact() { final byte[] serArr = shared.toByteArray(); final MemorySegment srcSeg = MemorySegment.ofArray(serArr).asReadOnly(); - final Sketch recoveredShared = Sketches.heapifyUpdateSketch(srcSeg); + final Sketch recoveredShared = UpdateSketch.heapify(srcSeg); //reconstruct to Native/Direct final int bytes = Sketch.getMaxUpdateSketchBytes(k); diff --git a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java index e812ab8f2..59b6396b7 100644 --- a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java +++ b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java @@ -31,19 +31,12 @@ import static org.testng.Assert.fail; import java.lang.foreign.MemorySegment; + import org.apache.datasketches.common.Family; -import org.apache.datasketches.common.Util; -import 
org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.Intersection; -import org.apache.datasketches.theta.IntersectionImpl; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketches; -import org.apache.datasketches.theta.Union; -import org.apache.datasketches.theta.UpdateSketch; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesReadOnlyException; import org.apache.datasketches.common.SketchesStateException; +import org.apache.datasketches.common.Util; import org.testng.annotations.Test; /** @@ -471,7 +464,7 @@ public void checkWrapVirginEmpty() { MemorySegment iSeg = MemorySegment.ofArray(new byte[segBytes]); inter1 = SetOperation.builder().buildIntersection(iSeg); //virgin off-heap - inter2 = Sketches.wrapIntersection(iSeg); //virgin off-heap, identical to inter1 + inter2 = Intersection.wrap(iSeg); //virgin off-heap, identical to inter1 //both in virgin state, empty = false //note: both inter1 and inter2 are tied to the same MemorySegment, // so an intersect to one also affects the other. Don't do what I do! @@ -493,7 +486,7 @@ public void checkWrapVirginEmpty() { //test the path via toByteArray, now in a different state iSeg = MemorySegment.ofArray(inter1.toByteArray()); - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); assertTrue(inter2.hasResult()); //still true //test the compaction path @@ -514,7 +507,7 @@ public void checkWrapNullEmpty2() { final MemorySegment iSeg = MemorySegment.ofArray(segArr); inter1 = SetOperation.builder().buildIntersection(iSeg); //virgin - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); //both in virgin state, empty = false assertFalse(inter1.hasResult()); assertFalse(inter2.hasResult()); @@ -525,7 +518,7 @@ public void checkWrapNullEmpty2() { //remains empty = false. 
inter1.intersect(sk1); - inter2 = Sketches.wrapIntersection(iSeg); + inter2 = Intersection.wrap(iSeg); assertTrue(inter1.hasResult()); assertTrue(inter2.hasResult()); final CompactSketch comp = inter2.getResult(true, null); @@ -579,7 +572,7 @@ public void checkBadPreambleLongs() { final MemorySegment seg = MemorySegment.ofArray(byteArray); //corrupt: seg.set(JAVA_BYTE, PREAMBLE_LONGS_BYTE, (byte) 2);//RF not used = 0 - Sketches.wrapIntersection(seg); + Intersection.wrap(seg); } @Test(expectedExceptions = SketchesArgumentException.class) @@ -596,18 +589,19 @@ public void checkBadSerVer() { final MemorySegment seg = MemorySegment.ofArray(byteArray); //corrupt: seg.set(JAVA_BYTE, SER_VER_BYTE, (byte) 2); - Sketches.wrapIntersection(seg); //throws in SetOperations + Intersection.wrap(seg); //throws in SetOperations } - @Test(expectedExceptions = ClassCastException.class) - public void checkFamilyID() { + @Test(expectedExceptions = SketchesArgumentException.class) + public void checkIncorrectWrap() { final int k = 32; Union union; union = SetOperation.builder().setNominalEntries(k).buildUnion(); final byte[] byteArray = union.toByteArray(); final MemorySegment seg = MemorySegment.ofArray(byteArray); - Sketches.wrapIntersection(seg); + Intersection.wrap(seg); //wrong sketch Family + //Sketches.wrapIntersection(seg); } @Test @@ -637,7 +631,7 @@ public void checkWrap() { final byte[] segArr2 = inter.toByteArray(); final MemorySegment srcSeg = MemorySegment.ofArray(segArr2); - inter2 = Sketches.wrapIntersection(srcSeg); + inter2 = Intersection.wrap(srcSeg); //2nd call = valid intersecting sk2 = UpdateSketch.builder().setNominalEntries(k).build(); @@ -656,7 +650,7 @@ public void checkWrap() { final byte[] segArr3 = inter2.toByteArray(); final MemorySegment srcSeg2 = MemorySegment.ofArray(segArr3); - inter3 = Sketches.wrapIntersection(srcSeg2); + inter3 = Intersection.wrap(srcSeg2); resultComp2 = inter3.getResult(false, null); est2 = resultComp2.getEstimate(); println("Est2: 
"+est2); @@ -683,13 +677,13 @@ public void checkExceptionMinSize() { @Test public void checkGetResult() { final int k = 1024; - final UpdateSketch sk = Sketches.updateSketchBuilder().build(); + final UpdateSketch sk = UpdateSketch.builder().build(); final int segBytes = getMaxIntersectionBytes(k); final byte[] segArr = new byte[segBytes]; final MemorySegment iSeg = MemorySegment.ofArray(segArr); - final Intersection inter = Sketches.setOperationBuilder().buildIntersection(iSeg); + final Intersection inter = SetOperation.builder().buildIntersection(iSeg); inter.intersect(sk); final CompactSketch csk = inter.getResult(); assertEquals(csk.getCompactBytes(), 8); @@ -732,8 +726,8 @@ public void checkExceptions2() { public void checkOverlappedDirect() { final int k = 1 << 4; final int segBytes = 2*k*16 +PREBYTES; //plenty of room - final UpdateSketch sk1 = Sketches.updateSketchBuilder().setNominalEntries(k).build(); - final UpdateSketch sk2 = Sketches.updateSketchBuilder().setNominalEntries(k).build(); + final UpdateSketch sk1 = UpdateSketch.builder().setNominalEntries(k).build(); + final UpdateSketch sk2 = UpdateSketch.builder().setNominalEntries(k).build(); for (int i=0; iV1 dates from roughly Aug 2014 to about May 2015. - * The library at that time had an early Theta sketch with set operations based on ByteBuffer, - * the Alpha sketch, and an early HLL sketch. It also had an early adaptor for Pig. - * It also had code for the even earlier CountUniqueSketch (for backward compatibility), - * which was the bucket sketch based on Giroire. - * - *
Serialization Version 1:
- *- * Long || Start Byte Adr: - * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || | Flags | LgResize | LgArr | lgNom | SkType | SerVer | MD_LONGS | - * - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 || | ------------CurCount-------------- | - * - * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | - * 2 || --------------------------THETA_LONG------------------------------ | - * - * || | 24 | - * 3 || ----------------------Start of Long Array------------------------ | - *- * - *
V2 is short-lived and dates from roughly Mid May 2015 to about June 1st, 2015. - * (V3 was created about June 15th in preparation for OpenSource in July.) - * The Theta sketch had evolved but still based on ByteBuffer. There was an UpdateSketch, - * the Alpha sketch, and the early HLL sketch. It also had an early adaptor for Pig. - * - * - *
Serialization Version 2:
- *- * Long || Start Byte Adr: - * Adr: - * || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | - * 0 || Seed Hash | Flags | lgArr | lgNom | SkType | SerVer | MD_LONGS + RR | - * - * || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 | - * 1 || --------------p-------------- | ---------Retained Entries Count-------- | - * - * || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 | - * 2 || --------------------------THETA_LONG----------------------------------- | - * - * || | 24 | - * 3 || ----------Start of Long Array, could be at 2 or 3 -------------------- | - *- * - *
- * // Metadata byte Addresses
- * private static final int METADATA_LONGS_BYTE = 0; //low 6 bits
- * private static final int LG_RESIZE_RATIO_BYTE = 0; //upper 2 bits
- * private static final int SER_VER_BYTE = 1;
- * private static final int SKETCH_TYPE_BYTE = 2;
- * private static final int LG_NOM_LONGS_BYTE = 3;
- * private static final int LG_ARR_LONGS_BYTE = 4;
- * private static final int FLAGS_BYTE = 5;
- * private static final int SEED_HASH_SHORT = 6; //byte 6,7
- * private static final int RETAINED_ENTRIES_COUNT_INT = 8; //4 byte aligned
- * private static final int P_FLOAT = 12; //4 byte aligned
- * private static final int THETA_LONG = 16; //8-byte aligned
- * //Backward compatibility
- * private static final int FLAGS_BYTE_V1 = 6;
- * private static final int LG_RESIZE_RATIO_BYTE_V1 = 5;
- *
- * // Constant Values
- * static final int SER_VER = 2;
- * static final int ALPHA_SKETCH = 1; //SKETCH_TYPE_BYTE
- * static final int QUICK_SELECT_SKETCH = 2;
- * static final int SET_SKETCH = 3;
- * static final int BUFFERED_QUICK_SELECT_SKETCH = 4;
- * static final String[] SKETCH_TYPE_STR =
- * { "None", "AlphaSketch", "QuickSelectSketch", "SetSketch", "BufferedQuickSelectSketch" };
- *
- * // flag bit masks
- * static final int BIG_ENDIAN_FLAG_MASK = 1;
- * static final int READ_ONLY_FLAG_MASK = 2;
- * static final int EMPTY_FLAG_MASK = 4;
- * static final int NO_REBUILD_FLAG_MASK = 8;
- * static final int UNORDERED_FLAG_MASK = 16;
- *
- *
- * @param skV3 a SerVer3, ordered CompactSketch
- * @param seed used for checking the seed hash (if one exists).
- * @return a SerVer2 SetSketch as MemorySegment object.
- */
- public static MemorySegment convertSerVer3toSerVer2(final CompactSketch skV3, final long seed) {
- final short seedHash = Util.computeSeedHash(seed);
- MemorySegment wseg = null;
-
- if (skV3 instanceof EmptyCompactSketch) {
- wseg = MemorySegment.ofArray(new long[1]);
- wseg.set(JAVA_BYTE, 0, (byte) 1); //preLongs
- wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer
- wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch
- final byte flags = (byte) 0xE; //NoRebuild, Empty, ReadOnly, LE
- wseg.set(JAVA_BYTE, 5, flags);
- wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash);
- return wseg;
- }
- if (skV3 instanceof SingleItemSketch) {
- final SingleItemSketch sis = (SingleItemSketch) skV3;
- wseg = MemorySegment.ofArray(new long[3]);
- wseg.set(JAVA_BYTE, 0, (byte) 2); //preLongs
- wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer
- wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch
- final byte flags = (byte) 0xA; //NoRebuild, notEmpty, ReadOnly, LE
- wseg.set(JAVA_BYTE, 5, flags);
- wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash);
- wseg.set(JAVA_INT_UNALIGNED, 8, 1);
- final long[] arr = sis.getCache();
- wseg.set(JAVA_LONG_UNALIGNED, 16, arr[0]);
- return wseg;
- }
- //General CompactSketch
- final int preLongs = skV3.getCompactPreambleLongs();
- final int entries = skV3.getRetainedEntries(true);
- final boolean unordered = !(skV3.isOrdered());
- final byte flags = (byte) (0xA | (unordered ? 16 : 0)); //Unordered, NoRebuild, notEmpty, ReadOnly, LE
- wseg = MemorySegment.ofArray(new byte[(preLongs + entries) << 3]);
- wseg.set(JAVA_BYTE, 0, (byte) preLongs); //preLongs
- wseg.set(JAVA_BYTE, 1, (byte) 2); //SerVer
- wseg.set(JAVA_BYTE, 2, (byte) 3); //SetSketch
-
- wseg.set(JAVA_BYTE, 5, flags);
- wseg.set(JAVA_SHORT_UNALIGNED, 6, seedHash);
- wseg.set(JAVA_INT_UNALIGNED, 8, entries);
- if (preLongs == 3) {
- wseg.set(JAVA_LONG_UNALIGNED, 16, skV3.getThetaLong());
- }
- final long[] arr = skV3.getCache();
- MemorySegment.copy(arr, 0, wseg, JAVA_LONG_UNALIGNED, preLongs << 3, entries);
- return wseg;
- }
-}
diff --git a/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java b/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java
index c0bcbb0e4..e83651aed 100644
--- a/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java
+++ b/src/test/java/org/apache/datasketches/theta/DirectUnionTest.java
@@ -21,8 +21,6 @@
import static java.lang.foreign.ValueLayout.JAVA_BYTE;
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer1;
-import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer2;
import static org.apache.datasketches.theta.HeapUnionTest.testAllCompactForms;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
import static org.apache.datasketches.theta.SetOperation.getMaxUnionBytes;
@@ -32,7 +30,6 @@
import java.lang.foreign.MemorySegment;
import java.nio.ByteBuffer;
-import java.util.Arrays;
import org.apache.datasketches.common.Family;
import org.apache.datasketches.common.SketchesArgumentException;
@@ -451,105 +448,6 @@ public void checkDirectSegmentIn() {
assertEquals(cOut.getEstimate(), totU, .05*k);
}
- @Test
- public void checkSerVer1Handling() {
- final int lgK = 12; //4096
- final int k = 1 << lgK;
- final int u1 = 2*k;
- final int u2 = 1024; //smaller exact sketch forces early stop
- final int totU = u1+u2;
-
- final UpdateSketch usk1 = UpdateSketch.builder().setNominalEntries(k).build();
- final UpdateSketch usk2 = UpdateSketch.builder().setNominalEntries(k).build();
-
- for (int i=0; iThe resulting sketch will not retain any link to the source MemorySegment and all of its data will be * copied to the heap CompactSketch.
* - *This method assumes that the sketch image was created with the correct hash seed, so it is not checked. - * The resulting on-heap CompactSketch will be given the seedHash derived from the given sketch image. - * However, Serial Version 1 sketch images do not have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.
+ *The {@link Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} is assumed.
* * @param srcSeg an image of a CompactSketch. * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg) { - //final boolean checkSeedHash = extractSerVer(srcSeg) != 1; - return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED, false); //false for SerVer 1 only + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -87,9 +83,7 @@ public static CompactSketch heapify(final MemorySegment srcSeg) { *The resulting sketch will not retain any link to the source MemorySegment and all of its data will be * copied to the heap CompactSketch.
* - *This method checks if the given expectedSeed was used to create the source MemorySegment image. - * However, SerialVersion 1 sketch images cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.
+ *This method checks if the given expectedSeed was used to create the source MemorySegment image.
* * @param srcSeg an image of a CompactSketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -97,10 +91,6 @@ public static CompactSketch heapify(final MemorySegment srcSeg) { * @return a CompactSketch on the heap. */ public static CompactSketch heapify(final MemorySegment srcSeg, final long expectedSeed) { - return heapify(srcSeg, expectedSeed, true); - } - - private static CompactSketch heapify(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { final int serVer = extractSerVer(srcSeg); final int familyID = extractFamilyID(srcSeg); final Family family = idToFamily(familyID); @@ -108,17 +98,18 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } if (serVer == 4) { - return heapifyV4(srcSeg, seed, enforceSeed); + return heapifyV4(srcSeg, expectedSeed); } if (serVer == 3) { final int flags = extractFlags(srcSeg); final boolean srcOrdered = (flags & ORDERED_FLAG_MASK) != 0; final boolean empty = (flags & EMPTY_FLAG_MASK) != 0; - if (enforceSeed && !empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } + if (!empty) { PreambleUtil.checkSegmentSeedHash(srcSeg, expectedSeed); } return CompactOperations.segmentToCompact(srcSeg, srcOrdered, null); } //not SerVer 3 or 4 - throw new SketchesArgumentException("Unknown Serialization Version: " + serVer); + throw new SketchesArgumentException( + "Corrupted: Serialization Version " + serVer + " not recognized."); } /** @@ -126,24 +117,17 @@ private static CompactSketch heapify(final MemorySegment srcSeg, final long seed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. 
- * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".
- * *Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *This method assumes that the sketch image was created with the correct hash seed, so it is not checked. - * However, Serial Version 1 sketch images do not have a seedHash field, - * so the resulting on-heap CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.
+ *The {@link Util#DEFAULT_UPDATE_SEED DEFAULT_UPDATE_SEED} is assumed.
* * @param srcSeg an image of a Sketch. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given MemorySegment. */ public static CompactSketch wrap(final MemorySegment srcSeg) { - return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED, false); + return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -151,69 +135,54 @@ public static CompactSketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".
- * *Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *This method checks if the given expectedSeed was used to create the source MemorySegment image. - * However, SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.
+ *This method checks if the given expectedSeed was used to create the source MemorySegment image.
* * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. - * @return a CompactSketch backed by the given MemorySegment except as above. + * @return a CompactSketch backed by the given MemorySegment. */ public static CompactSketch wrap(final MemorySegment srcSeg, final long expectedSeed) { - return wrap(srcSeg, expectedSeed, true); - } - - private static CompactSketch wrap(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { final int serVer = extractSerVer(srcSeg); final int familyID = extractFamilyID(srcSeg); final Family family = Family.idToFamily(familyID); if (family != Family.COMPACT) { throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } - final short seedHash = Util.computeSeedHash(seed); + final short seedHash = Util.computeSeedHash(expectedSeed); + - switch (serVer) { - case 3: { - if (PreambleUtil.isEmptyFlag(srcSeg)) { - return EmptyCompactSketch.getHeapInstance(srcSeg); - } - if (otherCheckForSingleItem(srcSeg)) { - return SingleItemSketch.heapify(srcSeg, enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); - } - //not empty & not singleItem - final int flags = extractFlags(srcSeg); - final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; - if (!compactFlag) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have compact flag set"); - } - final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; - if (!readOnly) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have Read-Only flag set"); - } - return DirectCompactSketch.wrapInstance(srcSeg, - enforceSeed ? 
seedHash : (short) extractSeedHash(srcSeg)); + if (serVer == 3) { + if (PreambleUtil.isEmptyFlag(srcSeg)) { + return EmptyCompactSketch.getHeapInstance(srcSeg); + } + if (otherCheckForSingleItem(srcSeg)) { + return SingleItemSketch.heapify(srcSeg, seedHash); } - case 4: { - return DirectCompactCompressedSketch.wrapInstance(srcSeg, - enforceSeed ? seedHash : (short) extractSeedHash(srcSeg)); + //not empty & not singleItem + final int flags = extractFlags(srcSeg); + final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; + if (!compactFlag) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have compact flag set"); } - default: { + final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; + if (!readOnly) { throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); + "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } + return DirectCompactSketch.wrapInstance(srcSeg, seedHash); } + if (serVer == 4) { + return DirectCompactCompressedSketch.wrapInstance(srcSeg, seedHash); + } + //not SerVer 3 or 4 + throw new SketchesArgumentException( + "Corrupted: Serialization Version " + serVer + " not recognized."); } /** @@ -278,38 +247,38 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo } final short seedHash = Util.computeSeedHash(seed); - switch (serVer) { - case 3: { - final int flags = bytes[FLAGS_BYTE]; - if ((flags & EMPTY_FLAG_MASK) > 0) { - return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); - } - final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; - if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { - return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); - } - //not empty & not singleItem - final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; - if (!compactFlag) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have compact flag set"); - } - final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; - if (!readOnly) { - throw new SketchesArgumentException( - "Corrupted: COMPACT family sketch image must have Read-Only flag set"); - } - return WrappedCompactSketch.wrapInstance(bytes, - enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + + if (serVer == 3) { + final int flags = bytes[FLAGS_BYTE]; + if ((flags & EMPTY_FLAG_MASK) > 0) { + return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); + } + final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; + if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { + return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); } - case 4: { - return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); + //not empty & not singleItem + final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; + if (!compactFlag) { + throw new SketchesArgumentException( + "Corrupted: COMPACT family sketch image must have compact flag set"); } - default: { + final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0; + if (!readOnly) { throw new SketchesArgumentException( - "Corrupted: Serialization Version " + serVer + " not recognized."); + "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } + return WrappedCompactSketch.wrapInstance(bytes, + enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + } + if (serVer ==4) { + return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); } + //not SerVer 3 or 4 + throw new SketchesArgumentException( + "Corrupted: Serialization Version " + serVer + " not recognized."); + + } //Sketch Overrides @@ -436,12 +405,12 @@ private byte[] toByteArrayV4() { return bytes; } - private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed, final boolean enforceSeed) { + private static CompactSketch heapifyV4(final MemorySegment srcSeg, final long seed) { final int preLongs = Sketch.getPreambleLongs(srcSeg); final int entryBits = extractEntryBitsV4(srcSeg); final int numEntriesBytes = extractNumEntriesBytesV4(srcSeg); final short seedHash = (short) extractSeedHash(srcSeg); - if (enforceSeed) { PreambleUtil.checkSegmentSeedHash(srcSeg, seed); } + PreambleUtil.checkSegmentSeedHash(srcSeg, seed); int offsetBytes = 8; long theta = Long.MAX_VALUE; if (preLongs > 1) { diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index 19dec2061..a95ebaaf6 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -190,10 +190,10 @@ private PreambleUtil() {} // ###### DO NOT MESS WITH THIS FROM HERE ... // Preamble byte Addresses - static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte. - static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte. Not used by compact, direct + static final int PREAMBLE_LONGS_BYTE = 0; //lower 6 bits in byte 0. + static final int LG_RESIZE_FACTOR_BIT = 6; //upper 2 bits in byte 0. 
Used by Update, Alpha, not used by compact, direct static final int SER_VER_BYTE = 1; - static final int FAMILY_BYTE = 2; //SerVer1,2 was SKETCH_TYPE_BYTE + static final int FAMILY_BYTE = 2; static final int LG_NOM_LONGS_BYTE = 3; //not used by compact static final int LG_ARR_LONGS_BYTE = 4; //not used by compact static final int FLAGS_BYTE = 5; @@ -203,28 +203,23 @@ private PreambleUtil() {} static final int THETA_LONG = 16; //8-byte aligned static final int UNION_THETA_LONG = 24; //8-byte aligned, only used by Union - // flag bit masks - static final int RESERVED_FLAG_MASK = 1; //SerVer 1, 2, 3. Now Reserved, no longer used. - static final int READ_ONLY_FLAG_MASK = 2; //Set but not read. Reserved. SerVer 1, 2, 3 - static final int EMPTY_FLAG_MASK = 4; //SerVer 2, 3 - static final int COMPACT_FLAG_MASK = 8; //SerVer 2 was NO_REBUILD_FLAG_MASK, 3 - static final int ORDERED_FLAG_MASK = 16;//SerVer 2 was UNORDERED_FLAG_MASK, 3 - static final int SINGLEITEM_FLAG_MASK = 32;//SerVer 3 - //The last 2 bits of the flags byte are reserved and assumed to be zero, for now. - - //Backward compatibility: SerVer1 preamble always 3 longs, SerVer2 preamble: 1, 2, 3 longs - // SKETCH_TYPE_BYTE 2 //SerVer1, SerVer2 - // V1, V2 types: Alpha = 1, QuickSelect = 2, SetSketch = 3; V3 only: Buffered QS = 4 - static final int LG_RESIZE_RATIO_BYTE_V1 = 5; //used by SerVer 1 - static final int FLAGS_BYTE_V1 = 6; //used by SerVer 1 + // flag byte bit masks + static final int RESERVED_FLAG_MASK = 1; //Bit 0: Reserved, no longer used. + static final int READ_ONLY_FLAG_MASK = 2; //Bit 1: Reserved, Set but not read. + static final int EMPTY_FLAG_MASK = 4; //Bit 2: + static final int COMPACT_FLAG_MASK = 8; //Bit 3: + static final int ORDERED_FLAG_MASK = 16;//Bit 4: + static final int SINGLEITEM_FLAG_MASK = 32;//Bit 5: + //The last 2 bits (Bit 6,7) of the flags byte are reserved and assumed to be zero. 
//Other constants static final int SER_VER = 3; + static final int SER_VER_COMPRESSED = 4; // serial version 4 compressed ordered sketch, not empty, not single item - static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes - static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries - static final int THETA_LONG_V4 = 8; //8-byte aligned + static final int ENTRY_BITS_BYTE_V4 = 3; // number of bits packed in deltas between hashes + static final int NUM_ENTRIES_BYTES_BYTE_V4 = 4; // number of bytes used for the number of entries + static final int THETA_LONG_V4 = 8; //8-byte aligned /** * Computes the number of bytes required for an updatable sketch using a hash-table cache. @@ -377,17 +372,13 @@ else if (preLongs == 3) { //@formatter:on static int extractPreLongs(final MemorySegment seg) { - return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //for SerVer 1,2,3 + return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; } static int extractLgResizeFactor(final MemorySegment seg) { return seg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) >>> LG_RESIZE_FACTOR_BIT & 0X3; } - static int extractLgResizeRatioV1(final MemorySegment seg) { - return seg.get(JAVA_BYTE, LG_RESIZE_RATIO_BYTE_V1) & 0X3; - } - static int extractSerVer(final MemorySegment seg) { return seg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; } @@ -408,10 +399,6 @@ static int extractFlags(final MemorySegment seg) { return seg.get(JAVA_BYTE, FLAGS_BYTE) & 0XFF; } - static int extractFlagsV1(final MemorySegment seg) { - return seg.get(JAVA_BYTE, FLAGS_BYTE_V1) & 0XFF; - } - static int extractSeedHash(final MemorySegment seg) { return seg.get(JAVA_SHORT_UNALIGNED, SEED_HASH_SHORT) & 0XFFFF; } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index bc944478d..1e9c65aa0 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ 
b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -78,12 +78,7 @@ public abstract class Sketch implements MemorySegmentStatus { * @return a Sketch on the heap. */ public static Sketch heapify(final MemorySegment srcSeg) { -// return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); - final int familyID = extractFamilyID(srcSeg); - if (familyID == Family.COMPACT.getID()) { - return CompactSketch.heapify(srcSeg);//, Util.DEFAULT_UPDATE_SEED); - } - return heapifyUpdateSketchFromMemorySegment(srcSeg, Util.DEFAULT_UPDATE_SEED); + return heapify(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -97,8 +92,6 @@ public static Sketch heapify(final MemorySegment srcSeg) { *For Compact Sketches this method assumes that the sketch image was created with the * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.
* - *Note: This assumes only SerVer 3 and later.
- * * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. @@ -119,7 +112,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have + *
Only "Direct" sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a on-heap CompactSketch * where all data will be copied to the heap. These early versions were never designed to "wrap".
@@ -128,34 +121,15 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space. * - *For Update Sketches this method checks if the + *
This method checks if the * Default Update Seed
- * was used to create the source MemorySegment image. - * - *For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked. SerialVersion 1 (pre-open-source) sketches cannot be checked.
+ * was used to create the source MemorySegment image. * * @param srcSeg a MemorySegment with an image of a Sketch. * @return a read-only Sketch backed by the given MemorySegment */ public static Sketch wrap(final MemorySegment srcSeg) { - final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; - final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; - final int familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; - final Family family = Family.idToFamily(familyID); - if (family == Family.QUICKSELECT) { - if (serVer == 3 && preLongs == 3) { - return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, Util.DEFAULT_UPDATE_SEED); - } else { - throw new SketchesArgumentException( - "Corrupted: " + family + " family image: must have SerVer = 3 and preLongs = 3"); - } - } - if (family == Family.COMPACT) { - return CompactSketch.wrap(srcSeg); - } - throw new SketchesArgumentException( - "Cannot wrap family: " + family + " as a Sketch"); + return wrap(srcSeg, Util.DEFAULT_UPDATE_SEED); } /** @@ -163,7 +137,7 @@ public static Sketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have + *
Only "Direct" sketches that have * been explicitly stored as direct sketches can be wrapped. * Wrapping earlier serial version sketches will result in a on-heap CompactSketch * where all data will be copied to the heap. These early versions were never designed to "wrap".
@@ -172,12 +146,8 @@ public static Sketch wrap(final MemorySegment srcSeg) { * result in on-heap equivalent forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space. * - *For Update Sketches this method checks if the - * Default Update Seed
- * was used to create the source MemorySegment image. - * - *For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked. SerialVersion 1 (pre-open-source) sketches cannot be checked.
+ *This method checks if the given expectedSeed + * was used to create the source MemorySegment image.
* * @param srcSeg a MemorySegment with an image of a Sketch. * @param expectedSeed the seed used to validate the given MemorySegment image. @@ -388,14 +358,14 @@ public static int getRetainedEntries(final MemorySegment srcSeg) { } return entries; } - //SerVer 2 or 3 + final int preLongs = Sketch.getPreambleLongs(srcSeg); - final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2 & 3 + final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; if (preLongs == 1) { return empty ? 0 : 1; } //preLongs > 1 - return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); //for SerVer 1,2,3 + return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); } /** @@ -658,16 +628,16 @@ static boolean getEmpty(final MemorySegment srcSeg) { if (serVer == 1) { return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0; } - return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; //for SerVer 2,3,4 + return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0; } static int getPreambleLongs(final MemorySegment srcSeg) { - return getAndCheckPreLongs(srcSeg); //for SerVer 1,2,3,4 + return getAndCheckPreLongs(srcSeg); } static long getThetaLong(final MemorySegment srcSeg) { final int preLongs = Sketch.getPreambleLongs(srcSeg); - return preLongs < 3 ? Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); //for SerVer 1,2,3,4 + return preLongs < 3 ? Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG); } /** @@ -725,7 +695,7 @@ static final double upperBound(final int curCount, final long thetaLong, final i } /** - * Instantiates a Heap Update Sketch from MemorySegment. Only SerVer3. SerVer 1 & 2 already handled. + * Instantiates a Heap Update Sketch from MemorySegment. * @param srcSeg the source MemorySegment * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. 
diff --git a/src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer1and2Test.java b/src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer3Test.java similarity index 51% rename from src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer1and2Test.java rename to src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer3Test.java index 68e1d04c6..b1dba552c 100644 --- a/src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer1and2Test.java +++ b/src/test/java/org/apache/datasketches/theta/HeapifyWrapSerVer3Test.java @@ -29,7 +29,7 @@ import org.testng.annotations.Test; @SuppressWarnings("resource") -public class HeapifyWrapSerVer1and2Test { +public class HeapifyWrapSerVer3Test { private static final short defaultSeedHash = Util.computeSeedHash(Util.DEFAULT_UPDATE_SEED); //Heapify CompactSketch @@ -46,50 +46,13 @@ public void checkHeapifyCompactSketchAssumedDefaultSeed() { final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); CompactSketch cskResult; - //SerialVersion3 test - cskResult = CompactSketch.heapify(cskSeg); //don't check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here - } - - @Test //Compact Assumed Different Seed - public void checkHeapifyCompactSketchAssumedDifferentSeed() { - final int k = 64; - final long seed = 128L; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - - final CompactSketch csk = usk.compact(); - final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); - CompactSketch cskResult; - - //SerialVersion3 test - cskResult = CompactSketch.heapify(cskSeg); //don't check seedHash here - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here - } - 
- @Test //Compact Given Default Seed - public void checkHeapifyCompactSketchGivenDefaultSeed() { - final int k = 64; - final long seed = Util.DEFAULT_UPDATE_SEED; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - - final CompactSketch csk = usk.compact(); - final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); - CompactSketch cskResult; - - //SerialVersion3 test - cskResult = CompactSketch.heapify(cskSeg, seed); //check seedHash here + cskResult = CompactSketch.heapify(cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); //check seedHash here + assertEquals(cskResult.getSeedHash(), seedHash); } - @Test //Compact Given Different Seed - public void checkHeapifyCompactSketchGivenDifferentSeed() { + @Test + public void checkHeapifyCompactSketchDifferentSeed() { final int k = 64; final long seed = 128L; final short seedHash = Util.computeSeedHash(seed); @@ -100,8 +63,7 @@ public void checkHeapifyCompactSketchGivenDifferentSeed() { final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); CompactSketch cskResult; - //SerialVersion3 test - cskResult = CompactSketch.heapify(cskSeg, seed); //check seedHash here + cskResult = CompactSketch.heapify(cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); } @@ -120,14 +82,13 @@ public void checkHeapifySketchAssumedDefaultSeed() { final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); CompactSketch cskResult; - //SerialVersion3 test cskResult = (CompactSketch) Sketch.heapify(cskSeg); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); } @Test - public void checkHeapifySketchAssumedDifferentSeed() { + public void 
checkHeapifySketchDifferentSeed() { final int k = 64; final long seed = 128L; final short seedHash = Util.computeSeedHash(seed); @@ -138,43 +99,6 @@ public void checkHeapifySketchAssumedDifferentSeed() { final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); CompactSketch cskResult; - //SerialVersion3 test - cskResult = (CompactSketch) Sketch.heapify(cskSeg); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - } - - @Test - public void checkHeapifySketchGivenDefaultSeed() { - final int k = 64; - final long seed = Util.DEFAULT_UPDATE_SEED; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - - final CompactSketch csk = usk.compact(); - final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); - CompactSketch cskResult; - - //SerialVersion3 test - cskResult = (CompactSketch) Sketch.heapify(cskSeg, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - } - - @Test - public void checkHeapifySketchGivenDifferentSeed() { - final int k = 64; - final long seed = 128L; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - - final CompactSketch csk = usk.compact(); - final MemorySegment cskSeg = MemorySegment.ofArray(csk.toByteArray()).asReadOnly(); - CompactSketch cskResult; - - //SerialVersion3 test cskResult = (CompactSketch) Sketch.heapify(cskSeg, seed); assertEquals(cskResult.getEstimate(), usk.getEstimate()); assertEquals(cskResult.getSeedHash(), seedHash); @@ -193,28 +117,6 @@ public void checkWrapCompactSketchAssumedDefaultSeed() { MemorySegment offHeap; final CompactSketch csk = usk.compact(); - 
//SerialVersion3 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = CompactSketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertTrue(cskResult.isOffHeap()); - } - } - - @Test - public void checkWrapCompactSketchAssumedDifferentSeed() { - final int k = 64; - final long seed = 128L; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - CompactSketch cskResult; - MemorySegment offHeap; - final CompactSketch csk = usk.compact(); - - //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); cskResult = CompactSketch.wrap(offHeap); @@ -225,28 +127,7 @@ public void checkWrapCompactSketchAssumedDifferentSeed() { } @Test - public void checkWrapCompactSketchGivenDefaultSeed() { - final int k = 64; - final long seed = Util.DEFAULT_UPDATE_SEED; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - CompactSketch cskResult; - MemorySegment offHeap; - final CompactSketch csk = usk.compact(); - - //SerialVersion3 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = CompactSketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertTrue(cskResult.isOffHeap()); - } - } - - @Test - public void checkWrapCompactSketchGivenDifferentSeed() { + public void checkWrapCompactSketchDifferentSeed() { final int k = 64; final long seed = 128L; final short seedHash = Util.computeSeedHash(seed); @@ -256,7 
+137,6 @@ public void checkWrapCompactSketchGivenDifferentSeed() { MemorySegment offHeap; final CompactSketch csk = usk.compact(); - //SerialVersion3 test try(Arena arena = Arena.ofConfined()) { offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); cskResult = CompactSketch.wrap(offHeap, seed); @@ -290,49 +170,7 @@ public void checkWrapSketchAssumedDefaultSeed() { } @Test - public void checkWrapSketchAssumedDifferentSeed() { - final int k = 64; - final long seed = 128L; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - CompactSketch cskResult; - MemorySegment offHeap; - final CompactSketch csk = usk.compact(); - - //SerialVersion3 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertTrue(cskResult.isOffHeap()); - } - } - - @Test - public void checkWrapSketchGivenDefaultSeed() { - final int k = 64; - final long seed = Util.DEFAULT_UPDATE_SEED; - final short seedHash = Util.computeSeedHash(seed); - final UpdateSketch usk = UpdateSketch.builder().setNominalEntries(k).setSeed(seed).build(); - for (int i = 0; i < k; i++) { usk.update(i); } - CompactSketch cskResult; - MemorySegment offHeap; - final CompactSketch csk = usk.compact(); - - //SerialVersion3 test - try(Arena arena = Arena.ofConfined()) { - offHeap = putOffHeap(MemorySegment.ofArray(csk.toByteArray()), arena); - cskResult = (CompactSketch) Sketch.wrap(offHeap, seed); - assertEquals(cskResult.getEstimate(), usk.getEstimate()); - assertEquals(cskResult.getSeedHash(), seedHash); - assertTrue(cskResult.isOffHeap()); - } - } - - @Test - public void checkWrapSketchGivenDifferentSeed() { + public void 
checkWrapSketchDifferentSeed() { final int k = 64; final long seed = 128L; final short seedHash = Util.computeSeedHash(seed); diff --git a/src/test/java/org/apache/datasketches/theta/PreambleUtilTest.java b/src/test/java/org/apache/datasketches/theta/PreambleUtilTest.java index f88b39185..61093c2a5 100644 --- a/src/test/java/org/apache/datasketches/theta/PreambleUtilTest.java +++ b/src/test/java/org/apache/datasketches/theta/PreambleUtilTest.java @@ -23,11 +23,9 @@ import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; -import static org.apache.datasketches.theta.PreambleUtil.extractFlagsV1; import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor; -import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeRatioV1; import static org.apache.datasketches.theta.PreambleUtil.extractP; import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; @@ -59,12 +57,6 @@ import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; -import org.apache.datasketches.theta.CompactSketch; -import org.apache.datasketches.theta.PreambleUtil; -import org.apache.datasketches.theta.SetOperation; -import org.apache.datasketches.theta.Sketch; -import org.apache.datasketches.theta.Union; -import org.apache.datasketches.theta.UpdateSketch; import org.testng.Assert; import org.testng.annotations.Test; @@ -191,12 +183,10 @@ public void checkInsertsAndExtracts() { insertFlags(wseg, 3); assertEquals(extractFlags(wseg), 3); - assertEquals(extractLgResizeRatioV1(wseg), 3); 
//also at byte 5, limited to 2 bits insertFlags(wseg, 0); insertSeedHash(wseg, ++v); assertEquals(extractSeedHash(wseg), v); - assertEquals(extractFlagsV1(wseg), v); //also at byte 6 insertSeedHash(wseg, 0); insertCurCount(wseg, ++v); From 5c3b84581ca9edbed5091315c739b9151bc1a34d Mon Sep 17 00:00:00 2001 From: Lee RhodesThis method checks if the - * Default Update Seed
+ * Default Update Seed * was used to create the source MemorySegment image. * * @param srcSeg a MemorySegment with an image of a Sketch. From 8c66f20720f1657b9188a6b969ccccb1e1af1af1 Mon Sep 17 00:00:00 2001 From: Lee RhodesOnly "Direct" Serialization Versions 3 and 4 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".
+ *Only sketches that have been explicitly stored as direct sketches can be wrapped.
* *Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image. - * Note that SerialVersion 1 (pre-open-source) sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of DEFAULT_UPDATE_SEED.
+ *This method checks if the DEFAULT_UPDATE_SEED was used to create the source byte array image.
* * @param bytes a byte array image of a Sketch that was created using the DEFAULT_UPDATE_SEED. * * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes) { - return wrap(bytes, Util.DEFAULT_UPDATE_SEED, false); + return wrap(bytes, Util.DEFAULT_UPDATE_SEED); } /** @@ -216,18 +209,13 @@ public static CompactSketch wrap(final byte[] bytes) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" Serialization Versions 3 and 4 (i.e, OpenSource) sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a heapify operation. - * These early versions were never designed to "wrap".
+ *Only sketches that have been explicitly stored as direct sketches can be wrapped.
* *Wrapping any subclass of this class that is empty or contains only a single item will * result in heapified forms of empty and single item sketch respectively. * This is actually faster and consumes less overall space.
* - *This method checks if the given expectedSeed was used to create the source byte array image. - * Note that SerialVersion 1 sketches cannot be checked as they don't have a seedHash field, - * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.
+ *This method checks if the given expectedSeed was used to create the source byte array image.
* * @param bytes a byte array image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given byte array image. @@ -235,18 +223,13 @@ public static CompactSketch wrap(final byte[] bytes) { * @return a CompactSketch backed by the given byte array except as above. */ public static CompactSketch wrap(final byte[] bytes, final long expectedSeed) { - return wrap(bytes, expectedSeed, true); - } - - private static CompactSketch wrap(final byte[] bytes, final long seed, final boolean enforceSeed) { final int serVer = bytes[PreambleUtil.SER_VER_BYTE]; final int familyId = bytes[PreambleUtil.FAMILY_BYTE]; final Family family = Family.idToFamily(familyId); if (family != Family.COMPACT) { throw new SketchesArgumentException("Corrupted: " + family + " is not Compact!"); } - final short seedHash = Util.computeSeedHash(seed); - + final short seedHash = Util.computeSeedHash(expectedSeed); if (serVer == 3) { final int flags = bytes[FLAGS_BYTE]; @@ -254,8 +237,8 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo return EmptyCompactSketch.getHeapInstance(MemorySegment.ofArray(bytes)); } final int preLongs = bytes[PREAMBLE_LONGS_BYTE]; - if (otherCheckForSingleItem(preLongs, serVer, familyId, flags)) { - return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), enforceSeed ? seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + if (checkForSingleItem(preLongs, serVer, familyId, flags)) { + return SingleItemSketch.heapify(MemorySegment.ofArray(bytes), seedHash); } //not empty & not singleItem final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0; @@ -268,8 +251,7 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo throw new SketchesArgumentException( "Corrupted: COMPACT family sketch image must have Read-Only flag set"); } - return WrappedCompactSketch.wrapInstance(bytes, - enforceSeed ? 
seedHash : getShortLE(bytes, SEED_HASH_SHORT)); + return WrappedCompactSketch.wrapInstance(bytes, seedHash); } if (serVer ==4) { return WrappedCompactCompressedSketch.wrapInstance(bytes, seedHash); @@ -277,8 +259,6 @@ private static CompactSketch wrap(final byte[] bytes, final long seed, final boo //not SerVer 3 or 4 throw new SketchesArgumentException( "Corrupted: Serialization Version " + serVer + " not recognized."); - - } //Sketch Overrides diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java index 55d7aa31e..4a3b80839 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactCompressedSketch.java @@ -84,7 +84,7 @@ public int getCurrentBytes() { private static final int START_PACKED_DATA_ESTIMATION_MODE = 16; @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch // number of entries is stored using variable length encoding // most significant bytes with all zeros are not stored // one byte in the preamble has the number of non-zero bytes used diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java index b289a1dbf..f393dc5b8 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java @@ -28,7 +28,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem; +import static 
org.apache.datasketches.theta.SingleItemSketch.checkForSingleItem; import java.lang.foreign.MemorySegment; @@ -80,15 +80,15 @@ public CompactSketch compact(final boolean dstOrdered, final MemorySegment dstSe @Override public int getCurrentBytes() { - if (otherCheckForSingleItem(seg_)) { return 16; } + if (checkForSingleItem(seg_)) { return 16; } final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 0 : extractCurCount(seg_); return (preLongs + curCount) << 3; } @Override - public int getRetainedEntries(final boolean valid) { //compact is always valid - if (otherCheckForSingleItem(seg_)) { return 1; } + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch + if (checkForSingleItem(seg_)) { return 1; } final int preLongs = Sketch.getPreambleLongs(seg_); return (preLongs == 1) ? 0 : extractCurCount(seg_); } @@ -146,7 +146,7 @@ public byte[] toByteArray() { @Override long[] getCache() { - if (otherCheckForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } + if (checkForSingleItem(seg_)) { return new long[] { seg_.get(JAVA_LONG_UNALIGNED, 8) }; } final int preLongs = Sketch.getPreambleLongs(seg_); final int curCount = (preLongs == 1) ? 
0 : extractCurCount(seg_); if (curCount > 0) { diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java index 3480ac2ea..723b6cc75 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java @@ -179,7 +179,7 @@ static DirectQuickSelectSketch writableWrap( final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); if (isResizeFactorIncorrect(srcSeg, lgNomLongs, lgArrLongs)) { diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index e3f7197cc..f78fbced4 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -28,7 +28,6 @@ import static org.apache.datasketches.theta.CompactOperations.correctThetaOnCompact; import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_ARR_LONGS_BYTE; -//import static org.apache.datasketches.theta.PreambleUtil.LG_NOM_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.LG_RESIZE_FACTOR_BIT; import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; import static org.apache.datasketches.theta.PreambleUtil.P_FLOAT; @@ -38,6 +37,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs; import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs; import static 
org.apache.datasketches.theta.PreambleUtil.extractThetaLong; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.insertThetaLong; import java.lang.foreign.MemorySegment; @@ -102,12 +102,11 @@ private DirectQuickSelectSketchR(final long seed, final MemorySegment srcSeg) { * @return instance of this sketch */ static DirectQuickSelectSketchR readOnlyWrap(final MemorySegment srcSeg, final long seed) { - final int preambleLongs = Sketch.getPreambleLongs(srcSeg); //byte 0 + final int preambleLongs = checkSegPreambleCap(srcSeg); //byte 0 final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - - UpdateSketch.checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, lgArrLongs); + UpdateSketch.checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); return new DirectQuickSelectSketchR(seed, srcSeg); } @@ -147,7 +146,7 @@ public Family getFamily() { } @Override - public int getRetainedEntries(final boolean valid) { //always valid for theta + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return wseg_.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT); } diff --git a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java index 45a17d40d..793ce1763 100644 --- a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java @@ -86,7 +86,7 @@ public int getCurrentBytes() { public double getEstimate() { return 0; } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return 0; } diff --git 
a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java index 6aeb09401..5a5c16f00 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java @@ -208,7 +208,7 @@ public double getLowerBound(final int numStdDev) { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch if (curCount_ > 0) { if (valid && isDirty()) { return HashOperations.countPart(getCache(), getLgArrLongs(), getThetaLong()); diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java index 50c419e61..69eebff5f 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java @@ -88,7 +88,7 @@ public int getCurrentBytes() { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return curCount_; } diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java index b51273404..c23deebf1 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java @@ -111,7 +111,7 @@ static HeapQuickSelectSketch heapifyInstance(final MemorySegment srcSeg, final l final int lgNomLongs = extractLgNomLongs(srcSeg); //byte 3 final int lgArrLongs = extractLgArrLongs(srcSeg); //byte 4 - checkUnionQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); + checkUnionAndQuickSelectFamily(srcSeg, preambleLongs, lgNomLongs); checkSegIntegrity(srcSeg, seed, preambleLongs, lgNomLongs, 
lgArrLongs); final float p = extractP(srcSeg); //bytes 12-15 @@ -149,7 +149,7 @@ public Family getFamily() { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return curCount_; } diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java index a95ebaaf6..ff35dfdaf 100644 --- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java +++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java @@ -204,7 +204,7 @@ private PreambleUtil() {} static final int UNION_THETA_LONG = 24; //8-byte aligned, only used by Union // flag byte bit masks - static final int RESERVED_FLAG_MASK = 1; //Bit 0: Reserved, no longer used. + static final int RESERVED_FLAG_MASK = 1; //Bit 0: Reserved, no longer used. Was BigEndian static final int READ_ONLY_FLAG_MASK = 2; //Bit 1: Reserved, Set but not read. static final int EMPTY_FLAG_MASK = 4; //Bit 2: static final int COMPACT_FLAG_MASK = 8; //Bit 3: @@ -256,7 +256,7 @@ static String preambleToString(final byte[] byteArr) { * @return the summary preamble string. */ static String preambleToString(final MemorySegment seg) { - final int preLongs = getAndCheckPreLongs(seg); + final int preLongs = checkSegPreambleCap(seg); final int rfId = extractLgResizeFactor(seg); final ResizeFactor rf = ResizeFactor.getRF(rfId); final int serVer = extractSerVer(seg); @@ -515,17 +515,16 @@ static boolean isEmptyFlag(final MemorySegment seg) { * @param seg the given MemorySegment * @return the extracted prelongs value. 
*/ - static int getAndCheckPreLongs(final MemorySegment seg) { - final long cap = seg.byteSize(); - if (cap < 8) { - throwNotBigEnough(cap, 8); + static int checkSegPreambleCap(final MemorySegment seg) { + try { + final int preLongs = extractPreLongs(seg); + final int required = Math.max(preLongs << 3, 8); + final long cap = seg.byteSize(); + if (cap < required) { throwNotBigEnough(cap, required); } + return preLongs; + } catch (IndexOutOfBoundsException e) { //thrown by MemorySegment + throw new SketchesArgumentException("Possible Corruption: Given MemorySegment is empty."); } - final int preLongs = extractPreLongs(seg); - final int required = Math.max(preLongs << 3, 8); - if (cap < required) { - throwNotBigEnough(cap, required); - } - return preLongs; } static short checkSegmentSeedHash(final MemorySegment seg, final long seed) { @@ -534,10 +533,10 @@ static short checkSegmentSeedHash(final MemorySegment seg, final long seed) { return seedHashSeg; } - private static void throwNotBigEnough(final long cap, final int required) { + private static void throwNotBigEnough(final long cap, final long required) { throw new SketchesArgumentException( - "Possible Corruption: Size of byte array or MemorySegment not large enough: Size: " + cap - + ", Required: " + required); + "Possible Corruption: Size of MemorySegment not large enough: Size: " + cap + + " < Required: " + required); } static int wholeBytesToHoldBits(final int bits) { diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java index 062c6d86d..766e1850d 100644 --- a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java +++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java @@ -24,6 +24,7 @@ import static org.apache.datasketches.common.ByteArrayUtil.putLongLE; import static org.apache.datasketches.hash.MurmurHash3.hash; import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK; 
+import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; import static org.apache.datasketches.theta.PreambleUtil.extractFlags; import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash; @@ -44,13 +45,13 @@ final class SingleItemSketch extends CompactSketch { private static final long DEFAULT_SEED_HASH = Util.computeSeedHash(Util.DEFAULT_UPDATE_SEED) & 0xFFFFL; // For backward compatibility, a candidate pre0_ long must have: - // Flags (byte 5): Ordered, Compact, NOT Empty, Read Only, LittleEndian = 11010 = 0x1A. + // Flags (byte 5): Ordered, Compact, NOT Empty, Read Only, NOT BigEndian = 11010 = 0x1A. (without SI flag) // Flags mask will be 0x1F. // SingleItem flag may not be set due to a historical bug, so we can't depend on it for now. // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3, // and the hash seed matches, it is virtually guaranteed that we have a SingleItem Sketch. 
- private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //with SI flag + private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //low 6 bytes, with SI flag private long pre0_ = 0; private long hash_ = 0; @@ -83,7 +84,7 @@ private SingleItemSketch(final long hash) { */ //does not override Sketch static SingleItemSketch heapify(final MemorySegment srcSeg, final short expectedSeedHash) { Util.checkSeedHashes((short) extractSeedHash(srcSeg), expectedSeedHash); - final boolean singleItem = otherCheckForSingleItem(srcSeg); + final boolean singleItem = checkForSingleItem(srcSeg); if (singleItem) { return new SingleItemSketch(srcSeg.get(JAVA_LONG_UNALIGNED, 8), expectedSeedHash); } throw new SketchesArgumentException("Input MemorySegment is not a SingleItemSketch."); } @@ -329,7 +330,7 @@ public double getLowerBound(final int numStdDev) { } @Override - public int getRetainedEntries(final boolean valid) { + public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch return 1; } @@ -383,25 +384,28 @@ short getSeedHash() { return (short) (pre0_ >>> 48); } - static boolean otherCheckForSingleItem(final MemorySegment seg) { - return otherCheckForSingleItem(Sketch.getPreambleLongs(seg), extractSerVer(seg), - extractFamilyID(seg), extractFlags(seg) ); + static boolean checkForSingleItem(final MemorySegment seg) { + final int preLongs = checkSegPreambleCap(seg); + return checkForSingleItem(preLongs, extractSerVer(seg), extractFamilyID(seg), extractFlags(seg) ); } - static boolean otherCheckForSingleItem(final int preLongs, final int serVer, - final int famId, final int flags) { - // Flags byte: SI=X, Ordered=T, Compact=T, Empty=F, ReadOnly=T, Reserved=F = X11010 = 0x1A. + static boolean checkForSingleItem( + final int preLongs, + final int serVer, + final int famId, + final int flags) { + // Flags byte: SI=X, Ordered=T, Compact=T, Empty=F, ReadOnly=T, Reserved(BE)=F = X11010 = 0x1A. // Flags mask will be 0x1F. 
// SingleItem flag may not be set due to a historical bug, so we can't depend on it for now. // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3, // and the hash seed matches (not done here), it is virtually guaranteed that we have a // SingleItem Sketch. - final boolean numPreLongs = preLongs == 1; - final boolean numSerVer = serVer >= 3; - final boolean numFamId = famId == Family.COMPACT.getID(); - final boolean numFlags = (flags & 0x1F) == 0x1A; //no SI, yet - final boolean singleFlag = (flags & SINGLEITEM_FLAG_MASK) > 0; - return (numPreLongs && numSerVer && numFamId && numFlags) || singleFlag; + final boolean preLongsOK = preLongs == 1; + final boolean serVerOK = serVer >= 3; + final boolean famIdOK = famId == Family.COMPACT.getID(); + final boolean flagsOK = (flags & 0x1F) == 0x1A; //no SI, yet + final boolean singleFlagOK = (flags & SINGLEITEM_FLAG_MASK) > 0; + return (preLongsOK && serVerOK && famIdOK && flagsOK) || singleFlagOK; } } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 33593e430..d14519062 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -19,9 +19,6 @@ package org.apache.datasketches.theta; -import static java.lang.foreign.ValueLayout.JAVA_BYTE; -import static java.lang.foreign.ValueLayout.JAVA_INT_UNALIGNED; -import static java.lang.foreign.ValueLayout.JAVA_LONG_UNALIGNED; import static org.apache.datasketches.common.Family.idToFamily; import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE; import static org.apache.datasketches.common.Util.LS; @@ -29,15 +26,12 @@ import static org.apache.datasketches.common.Util.zeroPad; import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK; import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK; -import static 
org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT; -import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE; -import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG; +import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap; +import static org.apache.datasketches.theta.PreambleUtil.extractCurCount; import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID; +import static org.apache.datasketches.theta.PreambleUtil.extractFlags; +import static org.apache.datasketches.theta.PreambleUtil.extractSerVer; import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong; -import static org.apache.datasketches.theta.PreambleUtil.getAndCheckPreLongs; import static org.apache.datasketches.thetacommon.HashOperations.count; import java.lang.foreign.MemorySegment; @@ -70,9 +64,6 @@ public abstract class Sketch implements MemorySegmentStatus { * Default Update Seed * was used to create the source MemorySegment image. * - *For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.
- * * @param srcSeg an image of a Sketch. * * @return a Sketch on the heap. @@ -89,9 +80,6 @@ public static Sketch heapify(final MemorySegment srcSeg) { *For Update Sketches this method checks if the expectedSeed * was used to create the source MemorySegment image.
* - *For Compact Sketches this method assumes that the sketch image was created with the - * correct hash seed, so it is not checked. SerialVersion 1 sketches (pre-open-source) cannot be checked.
- * * @param srcSeg an image of a Sketch that was created using the given expectedSeed. * @param expectedSeed the seed used to validate the given MemorySegment image. * See Update Hash Seed. @@ -99,9 +87,9 @@ public static Sketch heapify(final MemorySegment srcSeg) { * @return a Sketch on the heap. */ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed) { - final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE); - final Family family = idToFamily(familyID); - if (family == Family.COMPACT) { + checkSegPreambleCap(srcSeg); + final int familyID = extractFamilyID(srcSeg); + if (familyID == Family.COMPACT.getID()) { return CompactSketch.heapify(srcSeg, expectedSeed); } return heapifyUpdateSketchFromMemorySegment(srcSeg, expectedSeed); @@ -112,10 +100,7 @@ public static Sketch heapify(final MemorySegment srcSeg, final long expectedSeed * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *Only "Direct" sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to "wrap".
+ * <p>Only sketches that have been explicitly stored as direct sketches can be wrapped.</p>
* *Wrapping any subclass of this class that is empty or contains only a single item will * result in on-heap equivalent forms of empty and single item sketch respectively. @@ -137,10 +122,7 @@ public static Sketch wrap(final MemorySegment srcSeg) { * There is no data copying onto the java heap. * The wrap operation enables fast read-only merging and access to all the public read-only API. * - *
Only "Direct" sketches that have - * been explicitly stored as direct sketches can be wrapped. - * Wrapping earlier serial version sketches will result in a on-heap CompactSketch - * where all data will be copied to the heap. These early versions were never designed to "wrap".
+ * <p>Only sketches that have been explicitly stored as direct sketches can be wrapped.</p>
* *Wrapping any subclass of this class that is empty or contains only a single item will
* result in on-heap equivalent forms of empty and single item sketch respectively.
@@ -155,21 +137,15 @@ public static Sketch wrap(final MemorySegment srcSeg) {
* @return a read-only Sketch backed by the given MemorySegment.
*/
public static Sketch wrap(final MemorySegment srcSeg, final long expectedSeed) {
- final int preLongs = srcSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F;
- final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF;
- final int familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF;
- final Family family = Family.idToFamily(familyID);
- if (family == Family.QUICKSELECT) {
- if (serVer == 3 && preLongs == 3) {
- return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed);
- } else {
- throw new SketchesArgumentException(
- "Corrupted: " + family + " family image: must have SerVer = 3 and preLongs = 3");
- }
+ checkSegPreambleCap(srcSeg);
+ final int familyID = extractFamilyID(srcSeg);
+ if (familyID == Family.QUICKSELECT.getID()) {
+ return DirectQuickSelectSketchR.readOnlyWrap(srcSeg, expectedSeed);
}
- if (family == Family.COMPACT) {
+ if (familyID == Family.COMPACT.getID()) {
return CompactSketch.wrap(srcSeg, expectedSeed);
}
+ final Family family = Family.idToFamily(familyID);
throw new SketchesArgumentException(
"Cannot wrap family: " + family + " as a Sketch");
}
@@ -260,10 +236,11 @@ public int getCountLessThanThetaLong(final long thetaLong) {
* @return the result estimate
*/
public static double getEstimate(final MemorySegment srcSeg) {
- final int famId = extractFamilyID(srcSeg);
- if (!isValidSketchID(famId)) {
- throw new SketchesArgumentException("Source MemorySegment not a valid Sketch. Family: "
- + Family.idToFamily(famId).toString());
+ checkSegPreambleCap(srcSeg);
+ final int familyId = extractFamilyID(srcSeg);
+ if (!isValidSketchID(familyId)) {
+ throw new SketchesArgumentException("Source MemorySegment not a valid Sketch Family: "
+ + Family.idToFamily(familyId).toString());
}
return Sketch.estimate(extractThetaLong(srcSeg), getRetainedEntries(srcSeg));
}
@@ -338,53 +315,42 @@ public static int getUpdateSketchMaxBytes(final int lgNomEntries) {
/**
* Returns the number of valid entries that have been retained by the sketch.
- * @return the number of valid retained entries
+ * For the Alpha Sketch this returns only valid entries.
+ * @return the number of valid retained entries.
*/
public int getRetainedEntries() {
return getRetainedEntries(true);
}
+ /**
+ * Returns the number of entries that have been retained by the sketch.
+ * @param valid This parameter is only relevant for the Alpha Sketch.
+ * If true, returns the number of valid entries, which are less than theta and used
+ * for estimation. Otherwise, returns the number of all entries, valid or not, that are currently in the
+ * internal sketch cache.
+ * @return the number of retained entries
+ */
+ public abstract int getRetainedEntries(final boolean valid);
+
/**
* Returns the number of valid entries that have been retained by the sketch from the given MemorySegment
* @param srcSeg the given MemorySegment that has an image of a Sketch
* @return the number of valid retained entries
*/
public static int getRetainedEntries(final MemorySegment srcSeg) {
- final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE);
- if (serVer == 1) {
- final int entries = srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT);
- if (Sketch.getThetaLong(srcSeg) == Long.MAX_VALUE && entries == 0) {
- return 0;
- }
- return entries;
- }
-
- final int preLongs = Sketch.getPreambleLongs(srcSeg);
- final boolean empty = (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0;
- if (preLongs == 1) {
- return empty ? 0 : 1;
- }
- //preLongs > 1
- return srcSeg.get(JAVA_INT_UNALIGNED, RETAINED_ENTRIES_INT);
+ final int preLongs = checkSegPreambleCap(srcSeg);
+ final boolean empty = (extractFlags(srcSeg) & EMPTY_FLAG_MASK) != 0;
+ return (preLongs == 1) ? (empty ? 0 : 1) : extractCurCount(srcSeg);
}
- /**
- * Returns the number of entries that have been retained by the sketch.
- * @param valid if true, returns the number of valid entries, which are less than theta and used
- * for estimation.
- * Otherwise, return the number of all entries, valid or not, that are currently in the internal
- * sketch cache.
- * @return the number of retained entries
- */
- public abstract int getRetainedEntries(boolean valid);
-
/**
* Returns the serialization version from the given MemorySegment
* @param seg the sketch MemorySegment
* @return the serialization version from the MemorySegment
*/
public static int getSerializationVersion(final MemorySegment seg) {
- return seg.get(JAVA_BYTE, SER_VER_BYTE);
+ checkSegPreambleCap(seg);
+ return extractSerVer(seg);
}
/**
@@ -624,20 +590,21 @@ public static String toString(final MemorySegment seg) {
abstract short getSeedHash();
static boolean getEmpty(final MemorySegment srcSeg) {
- final int serVer = srcSeg.get(JAVA_BYTE, SER_VER_BYTE);
+ checkSegPreambleCap(srcSeg);
+ final int serVer = extractSerVer(srcSeg);
if (serVer == 1) {
return getThetaLong(srcSeg) == Long.MAX_VALUE && getRetainedEntries(srcSeg) == 0;
}
- return (srcSeg.get(JAVA_BYTE, FLAGS_BYTE) & EMPTY_FLAG_MASK) != 0;
+ return (extractFlags(srcSeg) & EMPTY_FLAG_MASK) != 0;
}
static int getPreambleLongs(final MemorySegment srcSeg) {
- return getAndCheckPreLongs(srcSeg);
+ return checkSegPreambleCap(srcSeg);
}
static long getThetaLong(final MemorySegment srcSeg) {
- final int preLongs = Sketch.getPreambleLongs(srcSeg);
- return preLongs < 3 ? Long.MAX_VALUE : srcSeg.get(JAVA_LONG_UNALIGNED, THETA_LONG);
+ final int preLongs = checkSegPreambleCap(srcSeg);
+ return preLongs < 3 ? Long.MAX_VALUE : extractThetaLong(srcSeg);
}
/**
@@ -702,20 +669,14 @@ static final double upperBound(final int curCount, final long thetaLong, final i
* @return a Sketch
*/
private static final Sketch heapifyUpdateSketchFromMemorySegment(final MemorySegment srcSeg, final long expectedSeed) {
- final long cap = srcSeg.byteSize();
- if (cap < 8) {
- throw new SketchesArgumentException(
- "Corrupted: valid sketch must be at least 8 bytes.");
- }
- final byte familyID = srcSeg.get(JAVA_BYTE, FAMILY_BYTE);
- final Family family = idToFamily(familyID);
+ final Family family = idToFamily(extractFamilyID(srcSeg));
if (family == Family.ALPHA) {
- final int flags = PreambleUtil.extractFlags(srcSeg);
+ final int flags = extractFlags(srcSeg);
final boolean compactFlag = (flags & COMPACT_FLAG_MASK) != 0;
if (compactFlag) {
throw new SketchesArgumentException(
- "Corrupted: ALPHA family image: cannot be compact");
+ "Corrupted: An ALPHA family image cannot be compact");
}
return HeapAlphaSketch.heapifyInstance(srcSeg, expectedSeed);
}
diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
index 4cd3a4cd4..7eb69eccb 100644
--- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
@@ -19,19 +19,15 @@
package org.apache.datasketches.theta;
-import static java.lang.foreign.ValueLayout.JAVA_BYTE;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE;
-import static org.apache.datasketches.common.Util.checkBounds;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static org.apache.datasketches.theta.CompactOperations.componentsToCompact;
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER;
-import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
+import static org.apache.datasketches.theta.PreambleUtil.checkSegPreambleCap;
import static org.apache.datasketches.theta.PreambleUtil.checkSegmentSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
@@ -106,12 +102,11 @@ public static UpdateSketch wrap(
final MemorySegmentRequest mSegReq,
final long expectedSeed) {
Objects.requireNonNull(srcWSeg, "Source MemorySegment must not be null");
- checkBounds(0, 24, srcWSeg.byteSize()); //need min 24 bytes
- final int preLongs = srcWSeg.get(JAVA_BYTE, PREAMBLE_LONGS_BYTE) & 0X3F; //mask to 6 bits
- final int serVer = srcWSeg.get(JAVA_BYTE, SER_VER_BYTE) & 0XFF; //mask to byte
- final int familyID = srcWSeg.get(JAVA_BYTE, FAMILY_BYTE) & 0XFF; //mask to byte
- final Family family = Family.idToFamily(familyID);
- if (family != Family.QUICKSELECT) {
+ final int preLongs = checkSegPreambleCap(srcWSeg) & 0X3F; //mask to 6 bits;
+ final int serVer = extractSerVer(srcWSeg);
+ final int familyID = extractFamilyID(srcWSeg);
+ if (familyID != Family.QUICKSELECT.getID()) {
+ final Family family = Family.idToFamily(familyID);
throw new SketchesArgumentException(
"A " + family + " sketch cannot be wrapped as an UpdateSketch.");
}
@@ -150,9 +145,9 @@ public static UpdateSketch heapify(final MemorySegment srcSeg) {
*/
public static UpdateSketch heapify(final MemorySegment srcSeg, final long expectedSeed) {
Objects.requireNonNull(srcSeg, "Source MemorySegment must not be null");
- checkBounds(0, 24, srcSeg.byteSize()); //need min 24 bytes
- final Family family = Family.idToFamily(srcSeg.get(JAVA_BYTE, FAMILY_BYTE));
- if (family.equals(Family.ALPHA)) {
+ checkSegPreambleCap(srcSeg);
+ final int familyID = extractFamilyID(srcSeg);
+ if (familyID == Family.ALPHA.getID()) {
return HeapAlphaSketch.heapifyInstance(srcSeg, expectedSeed);
}
return HeapQuickSelectSketch.heapifyInstance(srcSeg, expectedSeed);
@@ -418,23 +413,23 @@ public UpdateReturnState update(final long[] data) {
*/
abstract boolean isOutOfSpace(int numEntries);
- static void checkUnionQuickSelectFamily(final MemorySegment seg, final int preambleLongs,
- final int lgNomLongs) {
+ static void checkUnionAndQuickSelectFamily(final MemorySegment seg, final int preambleLongs, final int lgNomLongs) {
+
//Check Family
final int familyID = extractFamilyID(seg); //byte 2
- final Family family = Family.idToFamily(familyID);
- if (family.equals(Family.UNION)) {
+ if (familyID == Family.UNION.getID()) {
if (preambleLongs != Family.UNION.getMinPreLongs()) {
throw new SketchesArgumentException(
"Possible corruption: Invalid PreambleLongs value for UNION: " + preambleLongs);
}
}
- else if (family.equals(Family.QUICKSELECT)) {
+ else if (familyID == Family.QUICKSELECT.getID()) {
if (preambleLongs != Family.QUICKSELECT.getMinPreLongs()) {
throw new SketchesArgumentException(
"Possible corruption: Invalid PreambleLongs value for QUICKSELECT: " + preambleLongs);
}
} else {
+ final Family family = Family.idToFamily(familyID);
throw new SketchesArgumentException(
"Possible corruption: Invalid Family: " + family.toString());
}
diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java
index c4affc9ce..584338469 100644
--- a/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactCompressedSketch.java
@@ -69,7 +69,7 @@ public int getCurrentBytes() {
private static final int START_PACKED_DATA_ESTIMATION_MODE = 16;
@Override
- public int getRetainedEntries(final boolean valid) { //compact is always valid
+ public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
// number of entries is stored using variable length encoding
// most significant bytes with all zeros are not stored
// one byte in the preamble has the number of non-zero bytes used
diff --git a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java
index 08726a7ff..1f3f3ab9e 100644
--- a/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/WrappedCompactSketch.java
@@ -80,7 +80,7 @@ public int getCurrentBytes() {
}
@Override
- public int getRetainedEntries(final boolean valid) { //compact is always valid
+ public int getRetainedEntries(final boolean valid) { //valid is only relevant for the Alpha Sketch
final int preLongs = bytes_[PREAMBLE_LONGS_BYTE];
return (preLongs == 1) ? 0 : getIntLE(bytes_, RETAINED_ENTRIES_INT);
}
From 72c7a2467c210280e1efc5958b245191e1cb5fd9 Mon Sep 17 00:00:00 2001
From: Lee Rhodes