Destination S3 V2: copy over legacy DATs, enable tests #48490

Draft: wants to merge 43 commits into master

Commits
1771f4a  Bulk Load CDK: State Tests and Fixes; part no from state (johnny-schmidt, Nov 21, 2024)
e3460ad  staging to move into state (johnny-schmidt, Nov 21, 2024)
d652514  objects to delete into state (johnny-schmidt, Nov 21, 2024)
cd86ec3  staging data not moved until close; loader await bugfix (johnny-schmidt, Nov 21, 2024)
bf470f6  disable tests; remove dead code (johnny-schmidt, Nov 22, 2024)
a09c07c  better checkpoint behavior (edgao, Nov 22, 2024)
ea5febb  maybe fix truncate tests (edgao, Nov 22, 2024)
e98d80d  reenable tests for s3 (edgao, Nov 22, 2024)
6051b71  remove per-record println (edgao, Nov 22, 2024)
5856879  comments+cleanup (edgao, Nov 22, 2024)
77416bd  record differ should actually sort >.> (edgao, Nov 22, 2024)
1ce4a12  removed filler records; microbatching for tests; staging persists (johnny-schmidt, Nov 22, 2024)
8cd20df  fix mock dest IT: microbatches, concurrent files (johnny-schmidt, Nov 22, 2024)
e60ac5e  Bulk Load CDK: Staging falls back to metadata (johnny-schmidt, Nov 22, 2024)
dca4428  mock IT fixed for real this time? (johnny-schmidt, Nov 22, 2024)
8c05dc8  Docker tests get batch size limitation (johnny-schmidt, Nov 23, 2024)
88205b7  Bulk Load CDK: prefix search nonambiguous; filename uses wall time (#… (johnny-schmidt, Nov 23, 2024)
efa2cc5  version bump (johnny-schmidt, Nov 25, 2024)
6238141  Bulk Load CDK: prefix search nonambiguous; filename uses wall time (johnny-schmidt, Nov 21, 2024)
f88da53  copy over stream read constraints (edgao, Nov 15, 2024)
93b129e  add option for null = unset (edgao, Nov 15, 2024)
1455454  move container types to containerTypes (edgao, Nov 15, 2024)
19e6f13  oops (johnny-schmidt, Nov 20, 2024)
e80dc84  fix nulls in parquet array (edgao, Nov 15, 2024)
a7763d0  propagate nullability to avro array schema (edgao, Nov 15, 2024)
f7f83e0  fix avro data dumper (edgao, Nov 15, 2024)
fdbfc3c  improve float/int handling (edgao, Nov 15, 2024)
0913c63  improve float/int handling (edgao, Nov 15, 2024)
64efce0  fixed avro/parquet bigint (johnny-schmidt, Nov 17, 2024)
c4584c5  csv flattened fixed; all non-disabled tests green (johnny-schmidt, Nov 17, 2024)
fb27615  enabled remaining resume tests save 1; fixed: part no during truncate… (johnny-schmidt, Nov 18, 2024)
10a6900  failing unit tests (johnny-schmidt, Nov 18, 2024)
a6762e3  tweaks (johnny-schmidt, Nov 18, 2024)
c28a29c  fixed failing unit test (johnny-schmidt, Nov 18, 2024)
b97d7af  copy over legacy DATs + enable remaining tests (edgao, Nov 13, 2024)
491f47f  fix image name (edgao, Nov 14, 2024)
4faadad  json test fixes; s3 special chrs; spec tweak; empty streams; state re… (johnny-schmidt, Nov 18, 2024)
2556e6f  Fix: global state hangs when one stream is empty (johnny-schmidt, Nov 25, 2024)
5775fdc  fix: slash between path and file iff explicitly in config (johnny-schmidt, Nov 25, 2024)
a33eab7  fix: null-tolerant trace.emittedAt (johnny-schmidt, Nov 25, 2024)
59807a1  fix? treats state without type as global sort of? is this real (johnny-schmidt, Nov 25, 2024)
114a17d  fix? status type can be null too? (johnny-schmidt, Nov 25, 2024)
e3f0bb4  and the default compression is gzip apparently? (johnny-schmidt, Nov 25, 2024)
@@ -8,3 +8,5 @@ airbyte:
   flush:
     rate-ms: 900000 # 15 minutes
     window-ms: 900000 # 15 minutes
+  destination:
+    record-batch-size: ${AIRBYTE_DESTINATION_RECORD_BATCH_SIZE:209715200}
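The new airbyte.destination.record-batch-size property defaults to 209715200 bytes (200 MiB) and can be overridden through the AIRBYTE_DESTINATION_RECORD_BATCH_SIZE environment variable. A minimal sketch of how a Micronaut bean might read it; the provider class is hypothetical, not part of this PR:

```kotlin
import io.micronaut.context.annotation.Value
import jakarta.inject.Singleton

// Hypothetical consumer of the property above. Micronaut resolves the
// ${...} placeholder in the YAML, so the bean just reads the final value.
@Singleton
class RecordBatchSizeProvider(
    @Value("\${airbyte.destination.record-batch-size:209715200}")
    val recordBatchSizeBytes: Long,
)
```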
@@ -9,7 +9,6 @@ import io.airbyte.cdk.load.test.util.NoopExpectedRecordMapper
 import io.airbyte.cdk.load.test.util.NoopNameMapper
 import io.airbyte.cdk.load.write.BasicFunctionalityIntegrationTest
 import io.airbyte.cdk.load.write.Untyped
-import org.junit.jupiter.api.Disabled
 import org.junit.jupiter.api.Test

 class MockBasicFunctionalityIntegrationTest :
@@ -34,7 +33,6 @@ class MockBasicFunctionalityIntegrationTest :
     }

     @Test
-    @Disabled
     override fun testMidSyncCheckpointingStreamState() {
         super.testMidSyncCheckpointingStreamState()
     }
@@ -13,9 +13,11 @@ import io.airbyte.cdk.load.test.util.DestinationDataDumper
 import io.airbyte.cdk.load.test.util.OutputRecord
 import io.airbyte.cdk.load.test.util.RecordDiffer
 import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.ConcurrentLinkedQueue

 object MockDestinationBackend {
-    private val files: MutableMap<String, MutableList<OutputRecord>> = ConcurrentHashMap()
+    private val files: ConcurrentHashMap<String, ConcurrentLinkedQueue<OutputRecord>> =
+        ConcurrentHashMap()

     fun insert(filename: String, vararg records: OutputRecord) {
         getFile(filename).addAll(records)
@@ -96,7 +98,7 @@ object MockDestinationBackend {
     }

     fun readFile(filename: String): List<OutputRecord> {
-        return getFile(filename)
+        return getFile(filename).toList()
     }

     fun deleteOldRecords(filename: String, minGenerationId: Long) {
@@ -105,8 +107,8 @@ object MockDestinationBackend {
         }
     }

-    private fun getFile(filename: String): MutableList<OutputRecord> {
-        return files.getOrPut(filename) { mutableListOf() }
+    private fun getFile(filename: String): ConcurrentLinkedQueue<OutputRecord> {
+        return files.getOrPut(filename) { ConcurrentLinkedQueue() }
     }
 }
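Why swap MutableList for ConcurrentLinkedQueue and, just as importantly, change the declared map type? Kotlin resolves extension functions against the static receiver type: on a MutableMap, getOrPut is a plain get-then-put, so two writers racing through getFile could each install a queue and one queue's records would be lost; on a ConcurrentMap receiver, a putIfAbsent-backed overload of getOrPut is selected instead. A self-contained sketch of the distinction (this rationale is an assumption; the PR does not state it):

```kotlin
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ConcurrentLinkedQueue

// Same runtime object, two static types: overload resolution differs.
val asMutableMap: MutableMap<String, ConcurrentLinkedQueue<String>> = ConcurrentHashMap()
val asConcurrentMap: ConcurrentHashMap<String, ConcurrentLinkedQueue<String>> = ConcurrentHashMap()

fun demo() {
    // Resolves MutableMap.getOrPut: get-then-put, not atomic under contention.
    asMutableMap.getOrPut("f") { ConcurrentLinkedQueue() }
    // Resolves ConcurrentMap.getOrPut: backed by putIfAbsent, so exactly one
    // queue per key ever wins, even if the lambda runs more than once.
    asConcurrentMap.getOrPut("f") { ConcurrentLinkedQueue() }
}
```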
@@ -11,8 +11,8 @@ import io.micronaut.context.annotation.Factory
 import jakarta.inject.Singleton

 class MockDestinationConfiguration : DestinationConfiguration() {
-    // override to 10KB instead of 200MB
-    override val recordBatchSizeBytes = 10 * 1024L
+    // Micro-batch for testing.
+    override val recordBatchSizeBytes = 1L
 }

 @Singleton
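Dropping the override from 10 KB to 1 byte forces a flush after every record, so batching edge cases surface in fast unit tests. A hypothetical sketch of the kind of size-gated accumulator such a setting exercises (names are illustrative, not the CDK's):

```kotlin
// With limitBytes = 1, every add() crosses the threshold and flushes
// immediately: one record per batch, the "micro-batch" the comment describes.
class BatchAccumulator(private val limitBytes: Long) {
    private val buffer = mutableListOf<String>()
    private var bytes = 0L

    fun add(record: String, onFlush: (List<String>) -> Unit) {
        buffer += record
        bytes += record.length
        if (bytes >= limitBytes) {
            onFlush(buffer.toList())
            buffer.clear()
            bytes = 0
        }
    }
}
```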
@@ -5,6 +5,7 @@
 package io.airbyte.cdk.load.command

 import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog
+import io.github.oshai.kotlinlogging.KotlinLogging
 import io.micronaut.context.annotation.Factory
 import io.micronaut.context.annotation.Secondary
 import jakarta.inject.Singleton
@@ -14,6 +15,8 @@ import jakarta.inject.Singleton
  * for usability.
  */
 data class DestinationCatalog(val streams: List<DestinationStream> = emptyList()) {
+    private val log = KotlinLogging.logger {}
+
     private val byDescriptor: Map<DestinationStream.Descriptor, DestinationStream> =
         streams.associateBy { it.descriptor }

@@ -23,6 +26,7 @@
                 "Catalog must have at least one stream: check that files are in the correct location."
             )
         }
+        log.info { "Destination catalog initialized: $streams" }
     }

     fun getStream(name: String, namespace: String?): DestinationStream {
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.cdk.load.data
+
+import java.text.Normalizer
+import java.util.regex.Pattern
+
+class Transformations {
+    companion object {
+        private const val S3_SAFE_CHARACTERS = "\\p{Alnum}/!_.*')("
+        private const val S3_SPECIAL_CHARACTERS = "&$@=;:+,?-"
+        private val S3_CHARACTER_PATTERN =
+            "[^${S3_SAFE_CHARACTERS}${Pattern.quote(S3_SPECIAL_CHARACTERS)}]"
+
+        fun toS3SafeCharacters(input: String): String {
+            return Normalizer.normalize(input, Normalizer.Form.NFKD)
+                .replace(
+                    "\\p{M}".toRegex(),
+                    "",
+                ) // \p{M} matches a combining mark (Unicode), i.e. this strips accents
+                .replace(S3_CHARACTER_PATTERN.toRegex(), "_")
+        }
+    }
+}
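The helper normalizes to NFKD, strips combining marks, and replaces anything outside the safe and special sets with an underscore. A usage sketch against the class above (expected outputs are hand-derived assumptions, not PR test fixtures):

```kotlin
fun main() {
    // "é" decomposes to "e" plus a combining accent; the accent is stripped,
    // and the space (in neither allowed set) becomes an underscore.
    println(Transformations.toS3SafeCharacters("résumé 2024.csv")) // resume_2024.csv
    // '&' and '+' are in S3_SPECIAL_CHARACTERS and survive; ' ' and '#' do not.
    println(Transformations.toS3SafeCharacters("a&b+c #1")) // a&b+c__1
}
```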
@@ -318,6 +318,8 @@ data class GlobalCheckpoint(
     override val destinationStats: Stats? = null,
     val checkpoints: List<Checkpoint> = emptyList(),
     override val additionalProperties: Map<String, Any>,
+    val originalTypeField: AirbyteStateMessage.AirbyteStateType? =
+        AirbyteStateMessage.AirbyteStateType.GLOBAL,
 ) : CheckpointMessage {
     /** Convenience constructor, primarily intended for use in tests. */
     constructor(
@@ -333,7 +335,7 @@ data class GlobalCheckpoint(
     override fun asProtocolMessage(): AirbyteMessage {
         val stateMessage =
             AirbyteStateMessage()
-                .withType(AirbyteStateMessage.AirbyteStateType.GLOBAL)
+                .withType(originalTypeField)
                 .withGlobal(
                     AirbyteGlobalState()
                         .withSharedState(state)
@@ -433,7 +435,11 @@ class DestinationMessageFactory(
                         namespace = status.streamDescriptor.namespace,
                         name = status.streamDescriptor.name,
                     )
-                if (message.trace.type == AirbyteTraceMessage.Type.STREAM_STATUS) {
+                if (
+                    (message.trace.type
+                        ?: AirbyteTraceMessage.Type.STREAM_STATUS) ==
+                        AirbyteTraceMessage.Type.STREAM_STATUS
+                ) {
                     when (status.status) {
                         AirbyteStreamStatus.COMPLETE ->
                             if (fileTransferEnabled) {
@@ -444,7 +450,7 @@ class DestinationMessageFactory(
                             } else {
                                 DestinationRecordStreamComplete(
                                     stream.descriptor,
-                                    message.trace.emittedAt.toLong()
+                                    message.trace.emittedAt?.toLong() ?: 0L
                                 )
                             }
                         AirbyteStreamStatus.INCOMPLETE ->
@@ -476,6 +482,7 @@ class DestinationMessageFactory(
                         },
                         additionalProperties = message.state.additionalProperties,
                     )
+            null,
             AirbyteStateMessage.AirbyteStateType.GLOBAL ->
                 GlobalCheckpoint(
                     sourceStats =
@@ -488,6 +495,7 @@ class DestinationMessageFactory(
                         fromAirbyteStreamState(it)
                     },
                     additionalProperties = message.state.additionalProperties,
+                    originalTypeField = message.state.type,
                )
            else -> // TODO: Do we still need to handle LEGACY?
                Undefined
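Several of the later commits ("fix? treats state without type as global sort of?") deal with state messages that arrive with no type field at all. The pattern above: interpret a missing type as GLOBAL, but keep the original (possibly null) value in originalTypeField so the destination echoes the state back exactly as received. A self-contained sketch with hypothetical names:

```kotlin
enum class StateType { GLOBAL, STREAM }

data class StateMessage(val type: StateType?) {
    // Interpret: a missing type is handled as GLOBAL.
    fun effectiveType(): StateType = type ?: StateType.GLOBAL
    // Re-emit: the original field, null included, goes back out unchanged.
    fun emittedType(): StateType? = type
}

fun main() {
    val msg = StateMessage(type = null)
    println(msg.effectiveType()) // GLOBAL
    println(msg.emittedType())   // null: round-trips exactly as received
}
```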
@@ -25,7 +25,8 @@
         val airbyteMessage = serialized.deserializeToClass(AirbyteMessage::class.java)
         return messageFactory.fromAirbyteMessage(airbyteMessage, serialized)
     } catch (t: Throwable) {
-        throw RuntimeException("Failed to deserialize AirbyteMessage")
+        // TODO: S3V2: Remove logging the message XXX
+        throw RuntimeException("Failed to deserialize AirbyteMessage: $serialized", t)
         }
     }
 }
@@ -158,6 +158,9 @@
                 validateAndSendMessage(head.checkpointMessage, head.streamIndexes)
                 globalCheckpoints.poll() // don't remove until after we've successfully sent
             } else {
+                log.info {
+                    "Not flushing global checkpoint with stream indexes: ${head.streamIndexes}"
+                }
                 break
             }
         }
@@ -176,6 +179,7 @@
                 validateAndSendMessage(nextMessage, listOf(stream.descriptor to nextIndex))
                 streamCheckpoints.poll() // don't remove until after we've successfully sent
             } else {
+                log.info { "Not flushing next checkpoint for index $nextIndex" }
                 break
             }
         }
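The surrounding code peeks at the head checkpoint, sends it, and only then polls it off the queue, so a failed send leaves the checkpoint queued for the next flush. A minimal sketch of that peek-send-poll pattern (generic names, not CDK code):

```kotlin
import java.util.concurrent.ConcurrentLinkedQueue

fun <T> flushReady(
    queue: ConcurrentLinkedQueue<T>,
    isReady: (T) -> Boolean,
    send: (T) -> Unit,
) {
    while (true) {
        val head = queue.peek() ?: break
        if (!isReady(head)) break // stop at the first unready checkpoint
        send(head)                // may throw; head stays queued if it does
        queue.poll()              // remove only after a successful send
    }
}
```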
@@ -173,6 +173,10 @@ class DefaultStreamManager(
     private fun isProcessingCompleteForState(index: Long, state: Batch.State): Boolean {
         val completeRanges = rangesState[state]!!

+        if (index == 0L && recordCount.get() == 0L) {
+            return true
+        }
+
         return completeRanges.encloses(Range.closedOpen(0L, index))
     }

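The guard handles truly empty streams. Assuming the range bookkeeping uses Guava (which Range.closedOpen suggests), an empty RangeSet encloses no range at all, not even the empty range [0, 0), so a stream with zero records would otherwise never count as fully processed:

```kotlin
import com.google.common.collect.Range
import com.google.common.collect.TreeRangeSet

fun main() {
    val completed = TreeRangeSet.create<Long>() // no batches have completed
    // "Records 0 until 0": the empty range describing a zero-record stream.
    val emptyStream = Range.closedOpen(0L, 0L)
    println(completed.encloses(emptyStream)) // false, hence the explicit guard
}
```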
@@ -13,6 +13,7 @@ import io.micronaut.context.annotation.Secondary
 import jakarta.inject.Singleton
 import java.util.concurrent.ConcurrentHashMap
 import kotlinx.coroutines.CompletableDeferred
+import kotlinx.coroutines.withTimeoutOrNull

 sealed interface SyncResult

@@ -79,7 +80,8 @@ class DefaultSyncManager(
         stream: DestinationStream.Descriptor
     ): StreamLoader? {
         val completable = streamLoaders[stream]
-        return completable?.let { if (it.isCompleted) it.await() else null }
+        // `.isCompleted` does not work as expected here.
+        return completable?.let { withTimeoutOrNull(1000L) { it.await() } }
     }

     override suspend fun awaitAllStreamsCompletedSuccessfully(): Boolean {
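isCompleted is a racy snapshot: a loader that finishes a millisecond after the check is reported as absent. The replacement waits up to a bounded timeout and falls back to null only if the deferred still has not completed. A runnable sketch of the difference:

```kotlin
import kotlinx.coroutines.CompletableDeferred
import kotlinx.coroutines.delay
import kotlinx.coroutines.launch
import kotlinx.coroutines.runBlocking
import kotlinx.coroutines.withTimeoutOrNull

fun main() = runBlocking {
    val loader = CompletableDeferred<String>()
    launch {
        delay(100) // the completer finishes soon, but not instantly
        loader.complete("stream-loader")
    }
    // loader.isCompleted is false right now; a bounded await still succeeds.
    val result = withTimeoutOrNull(1_000L) { loader.await() }
    println(result) // stream-loader
}
```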
@@ -130,7 +130,7 @@ T : ScopedTask {
                 log.warn { "Stream task $innerTask was cancelled." }
                 throw e
             } catch (e: Exception) {
-                log.error { "Caught exception in sync task $innerTask: $e" }
+                log.error { "Caught exception in stream task $innerTask: $e" }
                 handleStreamFailure(stream, e)
             }
         }
@@ -36,10 +36,12 @@ class DefaultUpdateCheckpointsTask(
             when (it.value) {
                 is StreamCheckpointWrapped -> {
                     val (_, stream, index, message) = it.value
+                    log.info { "Updating checkpoint for stream $stream with index $index" }
                     checkpointManager.addStreamCheckpoint(stream, index, it.replace(message))
                 }
                 is GlobalCheckpointWrapped -> {
                     val (_, streamIndexes, message) = it.value
+                    log.info { "Updating global checkpoint for streams $streamIndexes" }
                     checkpointManager.addGlobalCheckpoint(streamIndexes, it.replace(message))
                 }
             }
@@ -5,6 +5,7 @@
 package io.airbyte.cdk.load.test.util

 import java.time.OffsetDateTime
+import kotlin.test.assertEquals
 import org.junit.jupiter.api.Assertions
 import org.junit.jupiter.api.Test

@@ -118,4 +119,118 @@ class RecordDifferTest {
             diff
         )
     }
+
+    /** Verify that the differ can sort records which are identical other than the PK */
+    @Test
+    fun testSortPk() {
+        val diff =
+            RecordDiffer(primaryKey = listOf(listOf("id")), cursor = null)
+                .diffRecords(
+                    listOf(
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "foo"),
+                            airbyteMeta = null,
+                        ),
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 2, "name" to "bar"),
+                            airbyteMeta = null,
+                        ),
+                    ),
+                    listOf(
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 2, "name" to "bar"),
+                            airbyteMeta = null,
+                        ),
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "foo"),
+                            airbyteMeta = null,
+                        ),
+                    )
+                )
+        assertEquals(null, diff)
+    }
+
+    /** Verify that the differ can sort records which are identical other than the cursor */
+    @Test
+    fun testSortCursor() {
+        val diff =
+            RecordDiffer(primaryKey = listOf(listOf("id")), cursor = listOf("updated_at"))
+                .diffRecords(
+                    listOf(
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "foo", "updated_at" to 1),
+                            airbyteMeta = null,
+                        ),
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "bar", "updated_at" to 2),
+                            airbyteMeta = null,
+                        ),
+                    ),
+                    listOf(
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "bar", "updated_at" to 2),
+                            airbyteMeta = null,
+                        ),
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "foo", "updated_at" to 1),
+                            airbyteMeta = null,
+                        ),
+                    ),
+                )
+        assertEquals(null, diff)
+    }
+
+    /** Verify that the differ can sort records which are identical other than extractedAt */
+    @Test
+    fun testSortExtractedAt() {
+        val diff =
+            RecordDiffer(primaryKey = listOf(listOf("id")), cursor = null)
+                .diffRecords(
+                    listOf(
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "foo"),
+                            airbyteMeta = null,
+                        ),
+                        OutputRecord(
+                            extractedAt = 1,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "bar"),
+                            airbyteMeta = null,
+                        ),
+                    ),
+                    listOf(
+                        OutputRecord(
+                            extractedAt = 1,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "bar"),
+                            airbyteMeta = null,
+                        ),
+                        OutputRecord(
+                            extractedAt = 0,
+                            generationId = 0,
+                            data = mapOf("id" to 1, "name" to "foo"),
+                            airbyteMeta = null,
+                        ),
+                    )
+                )
+        assertEquals(null, diff)
+    }
 }
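These tests pin down commit 77416bd ("record differ should actually sort"): the differ must compare record sets order-independently. A hypothetical sketch of the underlying idea, sorting both sides on a composite key before comparing; this is not the CDK's RecordDiffer:

```kotlin
data class Rec(val id: Int, val updatedAt: Int, val extractedAt: Long, val name: String)

// Sort both sides on (PK, cursor, extractedAt) so that record order in the
// destination cannot produce spurious diffs; return null when they match.
fun diffRecords(expected: List<Rec>, actual: List<Rec>): String? {
    val key = compareBy<Rec>({ it.id }, { it.updatedAt }, { it.extractedAt })
    val e = expected.sortedWith(key)
    val a = actual.sortedWith(key)
    return if (e == a) null else "expected $e but got $a"
}

fun main() {
    val records = listOf(Rec(1, 1, 0, "foo"), Rec(2, 1, 0, "bar"))
    println(diffRecords(records, records.reversed())) // null: order is irrelevant
}
```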
@@ -73,6 +73,7 @@ object StubDestinationMessageFactory {
             sourceStats = CheckpointMessage.Stats(recordCount),
             checkpoints = emptyList(),
             additionalProperties = emptyMap(),
+            originalTypeField = message.state.type,
         )
     }
