fix stream duplication on operator restart (#2733)

* fix stream duplication on operator restart
* add try except to streams e2e test
This commit is contained in:
Felix Kunde 2024-08-20 14:38:07 +02:00 committed by GitHub
parent c7ee34ed12
commit 2f7e3ee847
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 224 additions and 151 deletions

View File

@ -2131,130 +2131,136 @@ class EndToEndTestCase(unittest.TestCase):
verbs=["create", "delete", "deletecollection", "get", "list", "patch", "update", "watch"] verbs=["create", "delete", "deletecollection", "get", "list", "patch", "update", "watch"]
) )
cluster_role.rules.append(fes_cluster_role_rule) cluster_role.rules.append(fes_cluster_role_rule)
k8s.api.rbac_api.patch_cluster_role("postgres-operator", cluster_role)
# create a table in one of the database of acid-minimal-cluster try:
create_stream_table = """ k8s.api.rbac_api.patch_cluster_role("postgres-operator", cluster_role)
CREATE TABLE test_table (id int, payload jsonb);
"""
self.query_database(leader.metadata.name, "foo", create_stream_table)
# update the manifest with the streams section # create a table in one of the database of acid-minimal-cluster
patch_streaming_config = { create_stream_table = """
"spec": { CREATE TABLE test_table (id int, payload jsonb);
"patroni": { """
"slots": { self.query_database(leader.metadata.name, "foo", create_stream_table)
"manual_slot": {
"type": "physical" # update the manifest with the streams section
} patch_streaming_config = {
} "spec": {
}, "patroni": {
"streams": [ "slots": {
{ "manual_slot": {
"applicationId": "test-app", "type": "physical"
"batchSize": 100,
"database": "foo",
"enableRecovery": True,
"tables": {
"test_table": {
"eventType": "test-event",
"idColumn": "id",
"payloadColumn": "payload",
"recoveryEventType": "test-event-dlq"
} }
} }
}, },
{ "streams": [
"applicationId": "test-app2", {
"batchSize": 100, "applicationId": "test-app",
"database": "foo", "batchSize": 100,
"enableRecovery": True, "database": "foo",
"tables": { "enableRecovery": True,
"test_non_exist_table": { "tables": {
"eventType": "test-event", "test_table": {
"idColumn": "id", "eventType": "test-event",
"payloadColumn": "payload", "idColumn": "id",
"recoveryEventType": "test-event-dlq" "payloadColumn": "payload",
"recoveryEventType": "test-event-dlq"
}
}
},
{
"applicationId": "test-app2",
"batchSize": 100,
"database": "foo",
"enableRecovery": True,
"tables": {
"test_non_exist_table": {
"eventType": "test-event",
"idColumn": "id",
"payloadColumn": "payload",
"recoveryEventType": "test-event-dlq"
}
} }
} }
} ]
] }
} }
} k8s.api.custom_objects_api.patch_namespaced_custom_object(
k8s.api.custom_objects_api.patch_namespaced_custom_object( 'acid.zalan.do', 'v1', 'default', 'postgresqls', 'acid-minimal-cluster', patch_streaming_config)
'acid.zalan.do', 'v1', 'default', 'postgresqls', 'acid-minimal-cluster', patch_streaming_config) self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# check if publication, slot, and fes resource are created # check if publication, slot, and fes resource are created
get_publication_query = """ get_publication_query = """
SELECT * FROM pg_publication WHERE pubname = 'fes_foo_test_app'; SELECT * FROM pg_publication WHERE pubname = 'fes_foo_test_app';
""" """
get_slot_query = """ get_slot_query = """
SELECT * FROM pg_replication_slots WHERE slot_name = 'fes_foo_test_app'; SELECT * FROM pg_replication_slots WHERE slot_name = 'fes_foo_test_app';
""" """
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_publication_query)), 1, self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_publication_query)), 1,
"Publication is not created", 10, 5) "Publication is not created", 10, 5)
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_slot_query)), 1, self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_slot_query)), 1,
"Replication slot is not created", 10, 5) "Replication slot is not created", 10, 5)
self.eventuallyEqual(lambda: len(k8s.api.custom_objects_api.list_namespaced_custom_object( self.eventuallyEqual(lambda: len(k8s.api.custom_objects_api.list_namespaced_custom_object(
"zalando.org", "v1", "default", "fabriceventstreams", label_selector="cluster-name=acid-minimal-cluster")["items"]), 1, "zalando.org", "v1", "default", "fabriceventstreams", label_selector="cluster-name=acid-minimal-cluster")["items"]), 1,
"Could not find Fabric Event Stream resource", 10, 5) "Could not find Fabric Event Stream resource", 10, 5)
# check if the non-existing table in the stream section does not create a publication and slot # check if the non-existing table in the stream section does not create a publication and slot
get_publication_query_not_exist_table = """ get_publication_query_not_exist_table = """
SELECT * FROM pg_publication WHERE pubname = 'fes_foo_test_app2'; SELECT * FROM pg_publication WHERE pubname = 'fes_foo_test_app2';
""" """
get_slot_query_not_exist_table = """ get_slot_query_not_exist_table = """
SELECT * FROM pg_replication_slots WHERE slot_name = 'fes_foo_test_app2'; SELECT * FROM pg_replication_slots WHERE slot_name = 'fes_foo_test_app2';
""" """
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_publication_query_not_exist_table)), 0, self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_publication_query_not_exist_table)), 0,
"Publication is created for non-existing tables", 10, 5) "Publication is created for non-existing tables", 10, 5)
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_slot_query_not_exist_table)), 0, self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_slot_query_not_exist_table)), 0,
"Replication slot is created for non-existing tables", 10, 5) "Replication slot is created for non-existing tables", 10, 5)
# grant create and ownership of test_table to foo_user, reset search path to default # grant create and ownership of test_table to foo_user, reset search path to default
grant_permission_foo_user = """ grant_permission_foo_user = """
GRANT CREATE ON DATABASE foo TO foo_user; GRANT CREATE ON DATABASE foo TO foo_user;
ALTER TABLE test_table OWNER TO foo_user; ALTER TABLE test_table OWNER TO foo_user;
ALTER ROLE foo_user RESET search_path; ALTER ROLE foo_user RESET search_path;
""" """
self.query_database(leader.metadata.name, "foo", grant_permission_foo_user) self.query_database(leader.metadata.name, "foo", grant_permission_foo_user)
# non-postgres user creates a publication # non-postgres user creates a publication
create_nonstream_publication = """ create_nonstream_publication = """
CREATE PUBLICATION mypublication FOR TABLE test_table; CREATE PUBLICATION mypublication FOR TABLE test_table;
""" """
self.query_database_with_user(leader.metadata.name, "foo", create_nonstream_publication, "foo_user") self.query_database_with_user(leader.metadata.name, "foo", create_nonstream_publication, "foo_user")
# remove the streams section from the manifest # remove the streams section from the manifest
patch_streaming_config_removal = { patch_streaming_config_removal = {
"spec": { "spec": {
"streams": [] "streams": []
}
} }
} k8s.api.custom_objects_api.patch_namespaced_custom_object(
k8s.api.custom_objects_api.patch_namespaced_custom_object( 'acid.zalan.do', 'v1', 'default', 'postgresqls', 'acid-minimal-cluster', patch_streaming_config_removal)
'acid.zalan.do', 'v1', 'default', 'postgresqls', 'acid-minimal-cluster', patch_streaming_config_removal) self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
self.eventuallyEqual(lambda: k8s.get_operator_state(), {"0": "idle"}, "Operator does not get in sync")
# check if publication, slot, and fes resource are removed # check if publication, slot, and fes resource are removed
self.eventuallyEqual(lambda: len(k8s.api.custom_objects_api.list_namespaced_custom_object( self.eventuallyEqual(lambda: len(k8s.api.custom_objects_api.list_namespaced_custom_object(
"zalando.org", "v1", "default", "fabriceventstreams", label_selector="cluster-name=acid-minimal-cluster")["items"]), 0, "zalando.org", "v1", "default", "fabriceventstreams", label_selector="cluster-name=acid-minimal-cluster")["items"]), 0,
'Could not delete Fabric Event Stream resource', 10, 5) 'Could not delete Fabric Event Stream resource', 10, 5)
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_publication_query)), 0, self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_publication_query)), 0,
"Publication is not deleted", 10, 5) "Publication is not deleted", 10, 5)
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_slot_query)), 0, self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_slot_query)), 0,
"Replication slot is not deleted", 10, 5) "Replication slot is not deleted", 10, 5)
# check the manual_slot and mypublication should not get deleted # check the manual_slot and mypublication should not get deleted
get_manual_slot_query = """ get_manual_slot_query = """
SELECT * FROM pg_replication_slots WHERE slot_name = 'manual_slot'; SELECT * FROM pg_replication_slots WHERE slot_name = 'manual_slot';
""" """
get_nonstream_publication_query = """ get_nonstream_publication_query = """
SELECT * FROM pg_publication WHERE pubname = 'mypublication'; SELECT * FROM pg_publication WHERE pubname = 'mypublication';
""" """
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "postgres", get_manual_slot_query)), 1, self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "postgres", get_manual_slot_query)), 1,
"Slot defined in patroni config is deleted", 10, 5) "Slot defined in patroni config is deleted", 10, 5)
self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_nonstream_publication_query)), 1, self.eventuallyEqual(lambda: len(self.query_database(leader.metadata.name, "foo", get_nonstream_publication_query)), 1,
"Publication defined not in stream section is deleted", 10, 5) "Publication defined not in stream section is deleted", 10, 5)
except timeout_decorator.TimeoutError:
print('Operator log: {}'.format(k8s.get_operator_log()))
raise
@timeout_decorator.timeout(TEST_TIMEOUT_SEC) @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_taint_based_eviction(self): def test_taint_based_eviction(self):

View File

@ -433,34 +433,55 @@ func hasSlotsInSync(appId string, databaseSlots map[string]map[string]zalandov1.
} }
func (c *Cluster) syncStream(appId string) error { func (c *Cluster) syncStream(appId string) error {
var (
streams *zalandov1.FabricEventStreamList
err error
)
c.setProcessName("syncing stream with applicationId %s", appId)
c.logger.Debugf("syncing stream with applicationId %s", appId)
listOptions := metav1.ListOptions{LabelSelector: c.labelsSet(true).String()}
streams, err = c.KubeClient.FabricEventStreams(c.Namespace).List(context.TODO(), listOptions)
if err != nil {
return fmt.Errorf("could not list of FabricEventStreams for applicationId %s: %v", appId, err)
}
streamExists := false streamExists := false
// update stream when it exists and EventStreams array differs for _, stream := range streams.Items {
for _, stream := range c.Streams { if stream.Spec.ApplicationId != appId {
if appId == stream.Spec.ApplicationId { continue
streamExists = true }
desiredStreams := c.generateFabricEventStream(appId) if streamExists {
if !reflect.DeepEqual(stream.ObjectMeta.OwnerReferences, desiredStreams.ObjectMeta.OwnerReferences) { c.logger.Warningf("more than one event stream with applicationId %s found, delete it", appId)
c.logger.Infof("owner references of event streams with applicationId %s do not match the current ones", appId) if err = c.KubeClient.FabricEventStreams(stream.ObjectMeta.Namespace).Delete(context.TODO(), stream.ObjectMeta.Name, metav1.DeleteOptions{}); err != nil {
stream.ObjectMeta.OwnerReferences = desiredStreams.ObjectMeta.OwnerReferences c.logger.Errorf("could not delete event stream %q with applicationId %s: %v", stream.ObjectMeta.Name, appId, err)
c.setProcessName("updating event streams with applicationId %s", appId) } else {
stream, err := c.KubeClient.FabricEventStreams(stream.Namespace).Update(context.TODO(), stream, metav1.UpdateOptions{}) c.logger.Infof("redundant event stream %q with applicationId %s has been successfully deleted", stream.ObjectMeta.Name, appId)
if err != nil {
return fmt.Errorf("could not update event streams with applicationId %s: %v", appId, err)
}
c.Streams[appId] = stream
}
if match, reason := c.compareStreams(stream, desiredStreams); !match {
c.logger.Debugf("updating event streams with applicationId %s: %s", appId, reason)
desiredStreams.ObjectMeta = stream.ObjectMeta
updatedStream, err := c.updateStreams(desiredStreams)
if err != nil {
return fmt.Errorf("failed updating event streams %s with applicationId %s: %v", stream.Name, appId, err)
}
c.Streams[appId] = updatedStream
c.logger.Infof("event streams %q with applicationId %s have been successfully updated", updatedStream.Name, appId)
} }
continue continue
} }
streamExists = true
desiredStreams := c.generateFabricEventStream(appId)
if !reflect.DeepEqual(stream.ObjectMeta.OwnerReferences, desiredStreams.ObjectMeta.OwnerReferences) {
c.logger.Infof("owner references of event streams with applicationId %s do not match the current ones", appId)
stream.ObjectMeta.OwnerReferences = desiredStreams.ObjectMeta.OwnerReferences
c.setProcessName("updating event streams with applicationId %s", appId)
stream, err := c.KubeClient.FabricEventStreams(stream.Namespace).Update(context.TODO(), &stream, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("could not update event streams with applicationId %s: %v", appId, err)
}
c.Streams[appId] = stream
}
if match, reason := c.compareStreams(&stream, desiredStreams); !match {
c.logger.Debugf("updating event streams with applicationId %s: %s", appId, reason)
desiredStreams.ObjectMeta = stream.ObjectMeta
updatedStream, err := c.updateStreams(desiredStreams)
if err != nil {
return fmt.Errorf("failed updating event streams %s with applicationId %s: %v", stream.Name, appId, err)
}
c.Streams[appId] = updatedStream
c.logger.Infof("event streams %q with applicationId %s have been successfully updated", updatedStream.Name, appId)
}
} }
if !streamExists { if !streamExists {

View File

@ -2,6 +2,7 @@ package cluster
import ( import (
"fmt" "fmt"
"reflect"
"strings" "strings"
"context" "context"
@ -87,6 +88,11 @@ var (
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("%s-12345", clusterName), Name: fmt.Sprintf("%s-12345", clusterName),
Namespace: namespace, Namespace: namespace,
Labels: map[string]string{
"application": "spilo",
"cluster-name": fmt.Sprintf("%s-2", clusterName),
"team": "acid",
},
OwnerReferences: []metav1.OwnerReference{ OwnerReferences: []metav1.OwnerReference{
metav1.OwnerReference{ metav1.OwnerReference{
APIVersion: "apps/v1", APIVersion: "apps/v1",
@ -432,12 +438,8 @@ func TestGenerateFabricEventStream(t *testing.T) {
cluster.Name = clusterName cluster.Name = clusterName
cluster.Namespace = namespace cluster.Namespace = namespace
// create statefulset to have ownerReference for streams
_, err := cluster.createStatefulSet()
assert.NoError(t, err)
// create the streams // create the streams
err = cluster.syncStream(appId) err := cluster.syncStream(appId)
assert.NoError(t, err) assert.NoError(t, err)
// compare generated stream with expected stream // compare generated stream with expected stream
@ -451,11 +453,7 @@ func TestGenerateFabricEventStream(t *testing.T) {
} }
streams, err := cluster.KubeClient.FabricEventStreams(namespace).List(context.TODO(), listOptions) streams, err := cluster.KubeClient.FabricEventStreams(namespace).List(context.TODO(), listOptions)
assert.NoError(t, err) assert.NoError(t, err)
assert.Equalf(t, 1, len(streams.Items), "unexpected number of streams found: got %d, but expected only one", len(streams.Items))
// check if there is only one stream
if len(streams.Items) > 1 {
t.Errorf("too many stream CRDs found: got %d, but expected only one", len(streams.Items))
}
// compare stream returned from API with expected stream // compare stream returned from API with expected stream
if match, _ := cluster.compareStreams(&streams.Items[0], fes); !match { if match, _ := cluster.compareStreams(&streams.Items[0], fes); !match {
@ -468,11 +466,7 @@ func TestGenerateFabricEventStream(t *testing.T) {
streams, err = cluster.KubeClient.FabricEventStreams(namespace).List(context.TODO(), listOptions) streams, err = cluster.KubeClient.FabricEventStreams(namespace).List(context.TODO(), listOptions)
assert.NoError(t, err) assert.NoError(t, err)
assert.Equalf(t, 1, len(streams.Items), "unexpected number of streams found: got %d, but expected only one", len(streams.Items))
// check if there is still only one stream
if len(streams.Items) > 1 {
t.Errorf("too many stream CRDs found after sync: got %d, but expected only one", len(streams.Items))
}
// compare stream returned from API with generated stream // compare stream returned from API with generated stream
if match, _ := cluster.compareStreams(&streams.Items[0], result); !match { if match, _ := cluster.compareStreams(&streams.Items[0], result); !match {
@ -493,6 +487,62 @@ func newFabricEventStream(streams []zalandov1.EventStream, annotations map[strin
} }
} }
// TestSyncStreams checks that syncStream cleans up duplicated streams: it
// creates a stream through syncStream, adds a second stream with the same
// applicationId directly through the client, and then expects the next
// syncStream call to leave exactly one stream whose owner references equal
// cluster.ownerReferences().
func TestSyncStreams(t *testing.T) {
	// Rename the shared pg fixture so this test operates on its own cluster name.
	// NOTE(review): the "cluster-name" label of the fes fixture presumably has
	// to match this "-2" suffix for the label selector below — confirm before
	// changing either side.
	pg.Name = fmt.Sprintf("%s-2", pg.Name)
	var cluster = New(
		Config{
			OpConfig: config.Config{
				PodManagementPolicy: "ordered_ready",
				Resources: config.Resources{
					ClusterLabels:         map[string]string{"application": "spilo"},
					ClusterNameLabel:      "cluster-name",
					DefaultCPURequest:     "300m",
					DefaultCPULimit:       "300m",
					DefaultMemoryRequest:  "300Mi",
					DefaultMemoryLimit:    "300Mi",
					EnableOwnerReferences: util.True(),
					PodRoleLabel:          "spilo-role",
				},
			},
		}, client, pg, logger, eventRecorder)

	_, err := cluster.KubeClient.Postgresqls(namespace).Create(
		context.TODO(), &pg, metav1.CreateOptions{})
	assert.NoError(t, err)

	// create the stream
	err = cluster.syncStream(appId)
	assert.NoError(t, err)

	// create a second stream with same spec but with different name
	createdStream, err := cluster.KubeClient.FabricEventStreams(namespace).Create(
		context.TODO(), fes, metav1.CreateOptions{})
	if err != nil {
		// stop here: createdStream must not be dereferenced after a failed create
		t.Fatalf("could not create second stream: %v", err)
	}
	assert.Equal(t, appId, createdStream.Spec.ApplicationId)

	// check that two streams exist
	listOptions := metav1.ListOptions{
		LabelSelector: cluster.labelsSet(true).String(),
	}
	streams, err := cluster.KubeClient.FabricEventStreams(namespace).List(context.TODO(), listOptions)
	if err != nil {
		// stop here: the list result must not be inspected after a failed call
		t.Fatalf("could not list streams: %v", err)
	}
	assert.Equalf(t, 2, len(streams.Items), "unexpected number of streams found: got %d, but expected only 2", len(streams.Items))

	// sync the stream which should remove the redundant stream
	err = cluster.syncStream(appId)
	assert.NoError(t, err)

	// check that only one stream remains after sync
	streams, err = cluster.KubeClient.FabricEventStreams(namespace).List(context.TODO(), listOptions)
	if err != nil {
		t.Fatalf("could not list streams after sync: %v", err)
	}
	if len(streams.Items) != 1 {
		// fatal on purpose: indexing Items[0] below would panic on an empty list
		t.Fatalf("unexpected number of streams found: got %d, but expected only 1", len(streams.Items))
	}

	// check owner references
	if !reflect.DeepEqual(streams.Items[0].OwnerReferences, cluster.ownerReferences()) {
		t.Errorf("unexpected owner references, expected %#v, got %#v", cluster.ownerReferences(), streams.Items[0].OwnerReferences)
	}
}
func TestSameStreams(t *testing.T) { func TestSameStreams(t *testing.T) {
testName := "TestSameStreams" testName := "TestSameStreams"
annotationsA := map[string]string{"owned-by": "acid"} annotationsA := map[string]string{"owned-by": "acid"}
@ -606,8 +656,8 @@ func TestSameStreams(t *testing.T) {
} }
} }
func TestUpdateFabricEventStream(t *testing.T) { func TestUpdateStreams(t *testing.T) {
pg.Name = fmt.Sprintf("%s-2", pg.Name) pg.Name = fmt.Sprintf("%s-3", pg.Name)
var cluster = New( var cluster = New(
Config{ Config{
OpConfig: config.Config{ OpConfig: config.Config{
@ -628,11 +678,7 @@ func TestUpdateFabricEventStream(t *testing.T) {
context.TODO(), &pg, metav1.CreateOptions{}) context.TODO(), &pg, metav1.CreateOptions{})
assert.NoError(t, err) assert.NoError(t, err)
// create statefulset to have ownerReference for streams // create the stream
_, err = cluster.createStatefulSet()
assert.NoError(t, err)
// now create the stream
err = cluster.syncStream(appId) err = cluster.syncStream(appId)
assert.NoError(t, err) assert.NoError(t, err)