Implement runner for e2e tests (#548)

* implement a runner for e2e tests

* move e2e tests to a Docker container

* integrate e2e tests into build pipelines

* add tests for multi-namespace support and logical backup jobs

* @FxKu implement the first e2e test for failovers
Sergey Dudoladov 2019-06-05 17:07:27 +02:00 committed by GitHub
parent ec5b1d4d58
commit 69af2d60e5
11 changed files with 507 additions and 4 deletions

.flake8 (new file)

@@ -0,0 +1,3 @@
[flake8]
exclude=.git,__pycache__
max-line-length=120

.gitignore

@@ -34,3 +34,59 @@ scm-source.json
# diagrams
*.aux
*.log

# Python
# Adapted from https://github.com/github/gitignore/blob/master/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot

.travis.yml

@@ -15,8 +15,9 @@ before_install:
  - go get github.com/mattn/goveralls

install:
-  - make deps
+  - make deps e2e-tools e2e-build

script:
  - hack/verify-codegen.sh
  - travis_wait 20 goveralls -service=travis-ci -package ./pkg/... -v
- make e2e-run

Makefile

@@ -1,4 +1,4 @@
-.PHONY: clean local test linux macos docker push scm-source.json
+.PHONY: clean local test linux macos docker push scm-source.json e2e-run e2e-tools e2e-build

BINARY ?= postgres-operator
BUILD_FLAGS ?= -v
@@ -34,7 +34,7 @@ ifdef CDP_PULL_REQUEST_NUMBER
	CDP_TAG := -${CDP_BUILD_VERSION}
endif

KIND_PATH := $(GOPATH)/bin
PATH := $(GOPATH)/bin:$(PATH)
SHELL := env PATH=$(PATH) $(SHELL)
@@ -91,3 +91,16 @@ deps:
test:
	hack/verify-codegen.sh
	@go test ./...

e2e-build:
	docker build --tag="postgres-operator-e2e-tests" -f e2e/Dockerfile .

e2e-tools:
	# install pinned version of 'kind'
	# leave the name as is to avoid overwriting official binary named `kind`
	wget https://github.com/kubernetes-sigs/kind/releases/download/v0.3.0/kind-linux-amd64
	chmod +x kind-linux-amd64
	mv kind-linux-amd64 $(KIND_PATH)

e2e-run: docker
	e2e/run.sh

delivery.yaml

@@ -11,7 +11,7 @@ pipeline:
      apt-get update
  - desc: 'Install required build software'
    cmd: |
-      apt-get install -y make git apt-transport-https ca-certificates curl build-essential
+      apt-get install -y make git apt-transport-https ca-certificates curl build-essential python3 python3-pip
  - desc: 'Install go'
    cmd: |
      cd /tmp
@@ -41,6 +41,10 @@ pipeline:
      export PATH=$PATH:$HOME/go/bin
      cd $OPERATOR_TOP_DIR/postgres-operator
      go test ./...
  - desc: 'Run e2e tests'
    cmd: |
      cd $OPERATOR_TOP_DIR/postgres-operator
      make e2e-tools e2e-build e2e-run
  - desc: 'Push docker image'
    cmd: |
      export PATH=$PATH:$HOME/go/bin

docs/developer.md

@@ -322,6 +322,16 @@ Then you can for example check the Patroni logs:
kubectl logs acid-minimal-cluster-0
```

## End-to-end tests

The operator provides reference e2e (end-to-end) tests to ensure various infra parts work smoothly together. Each e2e execution tests a Postgres operator image built from the current git branch. The test runner starts a [kind](https://kind.sigs.k8s.io/) (local k8s) cluster and a Docker container with the tests. The k8s API client from within the container connects to the `kind` cluster over the standard Docker `bridge` network.

The tests utilize examples from `/manifests` (a ConfigMap is used for the operator configuration) to avoid maintaining yet another set of configuration files. The kind cluster is deleted if the tests complete successfully.

End-to-end tests are executed automatically during builds; to invoke them locally, run `make e2e-run` from the project's top directory. Before the first run, execute `make e2e-tools e2e-build` to install `kind` and build the test image.

End-to-end tests are written in Python and use `flake8` for code quality. Please run flake8 [before submitting a PR](http://flake8.pycqa.org/en/latest/user/using-hooks.html).

## Introduce additional configuration parameters

In the case you want to add functionality to the operator that shall be
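
The connectivity described in the documentation above can be illustrated with a short sketch (not part of this commit). It assumes it runs inside the e2e test container, where `e2e/run.sh` has bind-mounted the patched kubeconfig at `/root/.kube/config`, and it uses only the pinned `kubernetes==9.0.0` client from `e2e/requirements.txt`:

```python
# Minimal sketch: confirm the client can reach the kind API server over the
# Docker bridge network via the mounted kubeconfig. Paths and versions follow
# e2e/run.sh and e2e/requirements.txt; nothing here is operator-specific.
from kubernetes import client, config

config.load_kube_config()                 # reads /root/.kube/config by default
version = client.VersionApi().get_code()  # GET /version on the kind API server
print("connected to Kubernetes", version.git_version)
```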

e2e/Dockerfile (new file)

@@ -0,0 +1,22 @@
FROM ubuntu:18.04
LABEL maintainer="Team ACID @ Zalando <team-acid@zalando.de>"
WORKDIR /e2e
COPY manifests ./manifests
COPY e2e/requirements.txt e2e/tests ./
RUN apt-get update \
    && apt-get install --no-install-recommends -y \
           python3 \
           python3-setuptools \
           python3-pip \
           curl \
    && pip3 install --no-cache-dir -r requirements.txt \
    && curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.14.0/bin/linux/amd64/kubectl \
    && chmod +x ./kubectl \
    && mv ./kubectl /usr/local/bin/kubectl \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
CMD ["python3", "-m", "unittest", "discover", "--start-directory", ".", "-v"]

e2e/kind-cluster-postgres-operator-e2e-tests.yaml (new file)

@@ -0,0 +1,6 @@
kind: Cluster
apiVersion: kind.sigs.k8s.io/v1alpha3
nodes:
- role: control-plane
- role: worker
- role: worker
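
This topology (one control-plane plus two workers) is what the taint-based eviction test further below relies on: when the master's node is tainted, there must be at least one other worker to fail over to. The snippet below is a hedged sketch of checking that precondition with the same Python client the tests use; the control-plane label check mirrors the one in `test_taint_based_eviction` and is an assumption about how `kind` labels its nodes:

```python
# Sketch only: count worker nodes of the kind cluster. Assumes the control-plane
# node carries the "node-role.kubernetes.io/master" label, as the taint-based
# eviction test also assumes.
from kubernetes import client, config

config.load_kube_config()
workers = [n.metadata.name for n in client.CoreV1Api().list_node().items
           if "node-role.kubernetes.io/master" not in (n.metadata.labels or {})]
assert len(workers) >= 2, "need a second worker as a failover target"
print("worker nodes:", workers)
```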

e2e/requirements.txt (new file)

@@ -0,0 +1,3 @@
kubernetes==9.0.0
timeout_decorator==0.4.1
pyyaml==5.1

e2e/run.sh (new executable file)

@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# enable unofficial bash strict mode
set -o errexit
set -o nounset
set -o pipefail
IFS=$'\n\t'
readonly cluster_name="postgres-operator-e2e-tests"
readonly operator_image=$(docker images --filter=reference="registry.opensource.zalan.do/acid/postgres-operator" --format "{{.Repository}}:{{.Tag}}" | head -1)
readonly e2e_test_image=${cluster_name}
readonly kubeconfig_path="/tmp/kind-config-${cluster_name}"
function start_kind(){
  # avoid interference with previous test runs
  if [[ $(kind-linux-amd64 get clusters | grep "^${cluster_name}*") != "" ]]
  then
    kind-linux-amd64 delete cluster --name ${cluster_name}
  fi

  kind-linux-amd64 create cluster --name ${cluster_name} --config ./e2e/kind-cluster-postgres-operator-e2e-tests.yaml
  kind-linux-amd64 load docker-image "${operator_image}" --name ${cluster_name}
  KUBECONFIG="$(kind-linux-amd64 get kubeconfig-path --name=${cluster_name})"
  export KUBECONFIG
}

function set_kind_api_server_ip(){
  # use the actual kubeconfig to connect to the 'kind' API server
  # but update the IP address of the API server to the one from the Docker 'bridge' network
  cp "${KUBECONFIG}" /tmp
  readonly local kind_api_server_port=6443 # well-known in the 'kind' codebase
  readonly local kind_api_server=$(docker inspect --format "{{ .NetworkSettings.IPAddress }}:${kind_api_server_port}" "${cluster_name}"-control-plane)
  sed -i "s/server.*$/server: https:\/\/$kind_api_server/g" "${kubeconfig_path}"
}

function run_tests(){
  docker run --rm --mount type=bind,source="$(readlink -f ${kubeconfig_path})",target=/root/.kube/config -e OPERATOR_IMAGE="${operator_image}" "${e2e_test_image}"
}

function clean_up(){
  unset KUBECONFIG
  kind-linux-amd64 delete cluster --name ${cluster_name}
  rm -rf ${kubeconfig_path}
}

function main(){
  trap "clean_up" QUIT TERM EXIT

  start_kind
  set_kind_api_server_ip
  run_tests
  exit 0
}

main "$@"

e2e/tests/test_e2e.py (new file)

@@ -0,0 +1,327 @@
import unittest
import time
import timeout_decorator
import subprocess
import warnings
import os
import yaml
from kubernetes import client, config
class EndToEndTestCase(unittest.TestCase):
'''
Test interaction of the operator with multiple k8s components.
'''
    # `kind` pods may get stuck in the `Terminating` phase for a few minutes; hence the high test timeout
TEST_TIMEOUT_SEC = 600
@classmethod
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def setUpClass(cls):
'''
Deploy operator to a "kind" cluster created by /e2e/run.sh using examples from /manifests.
This operator deployment is to be shared among all tests.
/e2e/run.sh deletes the 'kind' cluster after successful run along with all operator-related entities.
In the case of test failure the cluster will stay to enable manual examination;
next invocation of "make e2e-run" will re-create it.
'''
# set a single k8s wrapper for all tests
k8s = cls.k8s = K8s()
# operator deploys pod service account there on start up
# needed for test_multi_namespace_support()
cls.namespace = "test"
v1_namespace = client.V1Namespace(metadata=client.V1ObjectMeta(name=cls.namespace))
k8s.api.core_v1.create_namespace(v1_namespace)
# submit the most recent operator image built on the Docker host
with open("manifests/postgres-operator.yaml", 'r+') as f:
operator_deployment = yaml.safe_load(f)
operator_deployment["spec"]["template"]["spec"]["containers"][0]["image"] = os.environ['OPERATOR_IMAGE']
yaml.dump(operator_deployment, f, Dumper=yaml.Dumper)
for filename in ["operator-service-account-rbac.yaml",
"configmap.yaml",
"postgres-operator.yaml"]:
k8s.create_with_kubectl("manifests/" + filename)
k8s.wait_for_operator_pod_start()
actual_operator_image = k8s.api.core_v1.list_namespaced_pod(
'default', label_selector='name=postgres-operator').items[0].spec.containers[0].image
print("Tested operator image: {}".format(actual_operator_image)) # shows up after tests finish
k8s.create_with_kubectl("manifests/minimal-postgres-manifest.yaml")
k8s.wait_for_pod_start('spilo-role=master')
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_multi_namespace_support(self):
'''
Create a customized Postgres cluster in a non-default namespace.
'''
k8s = self.k8s
with open("manifests/complete-postgres-manifest.yaml", 'r+') as f:
pg_manifest = yaml.safe_load(f)
pg_manifest["metadata"]["namespace"] = self.namespace
yaml.dump(pg_manifest, f, Dumper=yaml.Dumper)
k8s.create_with_kubectl("manifests/complete-postgres-manifest.yaml")
k8s.wait_for_pod_start("spilo-role=master", self.namespace)
self.assert_master_is_unique(self.namespace, version="acid-test-cluster")
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_scaling(self):
"""
Scale up from 2 to 3 and back to 2 pods by updating the Postgres manifest at runtime.
"""
k8s = self.k8s
labels = "version=acid-minimal-cluster"
k8s.wait_for_pg_to_scale(3)
self.assertEqual(3, k8s.count_pods_with_label(labels))
self.assert_master_is_unique()
k8s.wait_for_pg_to_scale(2)
self.assertEqual(2, k8s.count_pods_with_label(labels))
self.assert_master_is_unique()
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_taint_based_eviction(self):
"""
Add taint "postgres=:NoExecute" to node with master. This must cause a failover.
"""
k8s = self.k8s
cluster_label = 'version=acid-minimal-cluster'
# get nodes of master and replica(s) (expected target of new master)
current_master_node, failover_targets = k8s.get_pg_nodes(cluster_label)
num_replicas = len(failover_targets)
# if all pods live on the same node, failover will happen to other worker(s)
failover_targets = [x for x in failover_targets if x != current_master_node]
if len(failover_targets) == 0:
nodes = k8s.api.core_v1.list_node()
for n in nodes.items:
if "node-role.kubernetes.io/master" not in n.metadata.labels and n.metadata.name != current_master_node:
failover_targets.append(n.metadata.name)
# taint node with postgres=:NoExecute to force failover
body = {
"spec": {
"taints": [
{
"effect": "NoExecute",
"key": "postgres"
}
]
}
}
# patch node and test if master is failing over to one of the expected nodes
k8s.api.core_v1.patch_node(current_master_node, body)
k8s.wait_for_master_failover(failover_targets)
k8s.wait_for_pod_start('spilo-role=replica')
new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label)
self.assertNotEqual(current_master_node, new_master_node,
"Master on {} did not fail over to one of {}".format(current_master_node, failover_targets))
self.assertEqual(num_replicas, len(new_replica_nodes),
"Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes)))
self.assert_master_is_unique()
# undo the tainting
body = {
"spec": {
"taints": []
}
}
k8s.api.core_v1.patch_node(new_master_node, body)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_logical_backup_cron_job(self):
"""
Ensure we can (a) create the cron job at user request for a specific PG cluster
(b) update the cluster-wide image for the logical backup pod
(c) delete the job at user request
Limitations:
(a) Does not run the actual batch job because there is no S3 mock to upload backups to
(b) Assumes 'acid-minimal-cluster' exists as defined in setUp
"""
k8s = self.k8s
# create the cron job
schedule = "7 7 7 7 *"
pg_patch_enable_backup = {
"spec": {
"enableLogicalBackup": True,
"logicalBackupSchedule": schedule
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_enable_backup)
k8s.wait_for_logical_backup_job_creation()
jobs = k8s.get_logical_backup_job().items
self.assertEqual(1, len(jobs), "Expected 1 logical backup job, found {}".format(len(jobs)))
job = jobs[0]
self.assertEqual(job.metadata.name, "logical-backup-acid-minimal-cluster",
"Expected job name {}, found {}"
.format("logical-backup-acid-minimal-cluster", job.metadata.name))
self.assertEqual(job.spec.schedule, schedule,
"Expected {} schedule, found {}"
.format(schedule, job.spec.schedule))
# update the cluster-wide image of the logical backup pod
image = "test-image-name"
config_map_patch = {
"data": {
"logical_backup_docker_image": image,
}
}
k8s.api.core_v1.patch_namespaced_config_map("postgres-operator", "default", config_map_patch)
operator_pod = k8s.api.core_v1.list_namespaced_pod(
'default', label_selector="name=postgres-operator").items[0].metadata.name
k8s.api.core_v1.delete_namespaced_pod(operator_pod, "default") # restart reloads the conf
k8s.wait_for_operator_pod_start()
jobs = k8s.get_logical_backup_job().items
actual_image = jobs[0].spec.job_template.spec.template.spec.containers[0].image
self.assertEqual(actual_image, image,
"Expected job image {}, found {}".format(image, actual_image))
# delete the logical backup cron job
pg_patch_disable_backup = {
"spec": {
"enableLogicalBackup": False,
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_disable_backup)
k8s.wait_for_logical_backup_job_deletion()
jobs = k8s.get_logical_backup_job().items
self.assertEqual(0, len(jobs),
"Expected 0 logical backup jobs, found {}".format(len(jobs)))
def assert_master_is_unique(self, namespace='default', version="acid-minimal-cluster"):
"""
Check that there is a single pod in the k8s cluster with the label "spilo-role=master"
To be called manually after operations that affect pods
"""
k8s = self.k8s
labels = 'spilo-role=master,version=' + version
num_of_master_pods = k8s.count_pods_with_label(labels, namespace)
self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods))
class K8sApi:
def __init__(self):
# https://github.com/kubernetes-client/python/issues/309
warnings.simplefilter("ignore", ResourceWarning)
self.config = config.load_kube_config()
self.k8s_client = client.ApiClient()
self.core_v1 = client.CoreV1Api()
self.apps_v1 = client.AppsV1Api()
self.batch_v1_beta1 = client.BatchV1beta1Api()
self.custom_objects_api = client.CustomObjectsApi()
class K8s:
'''
    Wraps around the K8s API client and helper methods.
'''
RETRY_TIMEOUT_SEC = 5
def __init__(self):
self.api = K8sApi()
def get_pg_nodes(self, pg_cluster_name, namespace='default'):
master_pod_node = ''
replica_pod_nodes = []
podsList = self.api.core_v1.list_namespaced_pod(namespace, label_selector=pg_cluster_name)
for pod in podsList.items:
if pod.metadata.labels.get('spilo-role') == 'master':
master_pod_node = pod.spec.node_name
elif pod.metadata.labels.get('spilo-role') == 'replica':
replica_pod_nodes.append(pod.spec.node_name)
return master_pod_node, replica_pod_nodes
def wait_for_operator_pod_start(self):
        self.wait_for_pod_start("name=postgres-operator")
# HACK operator must register CRD / add existing PG clusters after pod start up
# for local execution ~ 10 seconds suffices
time.sleep(60)
def wait_for_pod_start(self, pod_labels, namespace='default'):
pod_phase = 'No pod running'
while pod_phase != 'Running':
pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=pod_labels).items
if pods:
pod_phase = pods[0].status.phase
time.sleep(self.RETRY_TIMEOUT_SEC)
def wait_for_pg_to_scale(self, number_of_instances, namespace='default'):
body = {
"spec": {
"numberOfInstances": number_of_instances
}
}
_ = self.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", namespace, "postgresqls", "acid-minimal-cluster", body)
labels = 'version=acid-minimal-cluster'
while self.count_pods_with_label(labels) != number_of_instances:
time.sleep(self.RETRY_TIMEOUT_SEC)
def count_pods_with_label(self, labels, namespace='default'):
return len(self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items)
def wait_for_master_failover(self, expected_master_nodes, namespace='default'):
pod_phase = 'Failing over'
new_master_node = ''
labels = 'spilo-role=master,version=acid-minimal-cluster'
while (pod_phase != 'Running') or (new_master_node not in expected_master_nodes):
pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
if pods:
new_master_node = pods[0].spec.node_name
pod_phase = pods[0].status.phase
time.sleep(self.RETRY_TIMEOUT_SEC)
def get_logical_backup_job(self, namespace='default'):
return self.api.batch_v1_beta1.list_namespaced_cron_job(namespace, label_selector="application=spilo")
def wait_for_logical_backup_job(self, expected_num_of_jobs):
while (len(self.get_logical_backup_job().items) != expected_num_of_jobs):
time.sleep(self.RETRY_TIMEOUT_SEC)
def wait_for_logical_backup_job_deletion(self):
self.wait_for_logical_backup_job(expected_num_of_jobs=0)
def wait_for_logical_backup_job_creation(self):
self.wait_for_logical_backup_job(expected_num_of_jobs=1)
def create_with_kubectl(self, path):
subprocess.run(["kubectl", "create", "-f", path])
if __name__ == '__main__':
unittest.main()
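
As a usage note, additional checks can reuse the `K8s` helper above and be picked up by the same `unittest` discovery. The module below is only an illustrative sketch, not part of this commit: the file name `test_extra.py`, the smoke-test scenario, and the assumption that the operator and `acid-minimal-cluster` are already deployed (as done in `EndToEndTestCase.setUpClass`) are all hypothetical.

```python
# Hypothetical e2e/tests/test_extra.py (illustrative only, not part of this commit).
# Inside the test container the test modules are copied next to each other, so the
# plain import of test_e2e works with `python3 -m unittest discover`.
import unittest

import timeout_decorator

from test_e2e import K8s

TEST_TIMEOUT_SEC = 600  # mirrors EndToEndTestCase.TEST_TIMEOUT_SEC


class SmokeTestCase(unittest.TestCase):
    '''
    Assumes the operator and the acid-minimal-cluster from EndToEndTestCase are deployed.
    '''

    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
    def test_minimal_cluster_is_up(self):
        k8s = K8s()
        k8s.wait_for_pod_start('spilo-role=master')
        # the minimal manifest defines two instances with exactly one master
        self.assertEqual(2, k8s.count_pods_with_label('version=acid-minimal-cluster'))
        self.assertEqual(1, k8s.count_pods_with_label(
            'spilo-role=master,version=acid-minimal-cluster'))
```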