Implement runner for e2e tests (#548)

* implement a runner for e2e tests

* move e2e tests to a Docker container

* integrate e2e tests into build pipelines

* add tests for multi-namespace support and logical backup jobs

* @FxKu implement the first e2e test for failovers
Sergey Dudoladov 2019-06-05 17:07:27 +02:00 committed by GitHub
parent ec5b1d4d58
commit 69af2d60e5
11 changed files with 507 additions and 4 deletions

.flake8 Normal file

@@ -0,0 +1,3 @@
[flake8]
exclude=.git,__pycache__
max-line-length=120

.gitignore vendored

@@ -34,3 +34,59 @@ scm-source.json
# diagrams
*.aux
*.log
# Python
# Adapted from https://github.com/github/gitignore/blob/master/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot

.travis.yml

@@ -15,8 +15,9 @@ before_install:
- go get github.com/mattn/goveralls
install:
- make deps
- make deps e2e-tools e2e-build
script:
- hack/verify-codegen.sh
- travis_wait 20 goveralls -service=travis-ci -package ./pkg/... -v
- make e2e-run

Makefile

@@ -1,4 +1,4 @@
.PHONY: clean local test linux macos docker push scm-source.json
.PHONY: clean local test linux macos docker push scm-source.json e2e-run e2e-tools e2e-build
BINARY ?= postgres-operator
BUILD_FLAGS ?= -v
@@ -34,7 +34,7 @@ ifdef CDP_PULL_REQUEST_NUMBER
CDP_TAG := -${CDP_BUILD_VERSION}
endif
KIND_PATH := $(GOPATH)/bin
PATH := $(GOPATH)/bin:$(PATH)
SHELL := env PATH=$(PATH) $(SHELL)
@@ -91,3 +91,16 @@ deps:
test:
hack/verify-codegen.sh
@go test ./...
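# package the Python e2e tests and the example manifests into a Docker image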
e2e-build:
docker build --tag="postgres-operator-e2e-tests" -f e2e/Dockerfile .
e2e-tools:
# install pinned version of 'kind'
# leave the name as is to avoid overwriting official binary named `kind`
wget https://github.com/kubernetes-sigs/kind/releases/download/v0.3.0/kind-linux-amd64
chmod +x kind-linux-amd64
mv kind-linux-amd64 $(KIND_PATH)
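# the 'docker' prerequisite rebuilds the operator image that the e2e tests deploy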
e2e-run: docker
e2e/run.sh

delivery.yaml

@@ -11,7 +11,7 @@ pipeline:
apt-get update
- desc: 'Install required build software'
cmd: |
apt-get install -y make git apt-transport-https ca-certificates curl build-essential
apt-get install -y make git apt-transport-https ca-certificates curl build-essential python3 python3-pip
- desc: 'Install go'
cmd: |
cd /tmp
@@ -41,6 +41,10 @@ pipeline:
export PATH=$PATH:$HOME/go/bin
cd $OPERATOR_TOP_DIR/postgres-operator
go test ./...
- desc: 'Run e2e tests'
cmd: |
cd $OPERATOR_TOP_DIR/postgres-operator
make e2e-tools e2e-build e2e-run
- desc: 'Push docker image'
cmd: |
export PATH=$PATH:$HOME/go/bin

docs/developer.md

@@ -322,6 +322,16 @@ Then you can for example check the Patroni logs:
kubectl logs acid-minimal-cluster-0
```
## End-to-end tests
The operator provides reference e2e (end-to-end) tests to ensure that the various infrastructure parts work together smoothly.
Each e2e run tests a Postgres operator image built from the current git branch. The test runner starts a [kind](https://kind.sigs.k8s.io/) (local k8s) cluster and a Docker container with the tests; the k8s API client inside that container connects to the `kind` cluster over the standard Docker `bridge` network.
The tests use the examples from `/manifests` (a ConfigMap holds the operator configuration) to avoid maintaining yet another set of configuration files. The `kind` cluster is deleted if the tests complete successfully.
End-to-end tests are executed automatically during builds; to invoke them locally, run `make e2e-run` from the project's top directory. Before the first run, execute `make e2e-tools e2e-build` to install `kind` and build the test image locally.
End-to-end tests are written in Python and checked with `flake8`. Please run `flake8` [before submitting a PR](http://flake8.pycqa.org/en/latest/user/using-hooks.html).
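New tests are added as methods of `EndToEndTestCase` in `e2e/tests/test_e2e.py` and can reuse the shared `K8s` helper; a minimal sketch (the method name and the two-instance assertion below are illustrative, not part of the suite):
```python
    @timeout_decorator.timeout(TEST_TIMEOUT_SEC)
    def test_example(self):
        '''
        Hypothetical example: the minimal cluster deployed in setUpClass runs with two instances.
        '''
        k8s = self.k8s
        # wait for the master pod of the example cluster, then count all of its pods
        k8s.wait_for_pod_start('spilo-role=master')
        self.assertEqual(2, k8s.count_pods_with_label("version=acid-minimal-cluster"))
```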
## Introduce additional configuration parameters
In the case you want to add functionality to the operator that shall be

e2e/Dockerfile Normal file

@@ -0,0 +1,22 @@
FROM ubuntu:18.04
LABEL maintainer="Team ACID @ Zalando <team-acid@zalando.de>"
WORKDIR /e2e
COPY manifests ./manifests
COPY e2e/requirements.txt e2e/tests ./
RUN apt-get update \
&& apt-get install --no-install-recommends -y \
python3 \
python3-setuptools \
python3-pip \
curl \
&& pip3 install --no-cache-dir -r requirements.txt \
&& curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.14.0/bin/linux/amd64/kubectl \
&& chmod +x ./kubectl \
&& mv ./kubectl /usr/local/bin/kubectl \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
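# discover and run every unittest test case under /e2e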
CMD ["python3", "-m", "unittest", "discover", "--start-directory", ".", "-v"]

e2e/kind-cluster-postgres-operator-e2e-tests.yaml Normal file

@@ -0,0 +1,6 @@
kind: Cluster
apiVersion: kind.sigs.k8s.io/v1alpha3
nodes:
- role: control-plane
- role: worker
- role: worker

e2e/requirements.txt Normal file

@@ -0,0 +1,3 @@
kubernetes==9.0.0
timeout_decorator==0.4.1
pyyaml==5.1

e2e/run.sh Executable file

@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# enable unofficial bash strict mode
set -o errexit
set -o nounset
set -o pipefail
IFS=$'\n\t'
readonly cluster_name="postgres-operator-e2e-tests"
readonly operator_image=$(docker images --filter=reference="registry.opensource.zalan.do/acid/postgres-operator" --format "{{.Repository}}:{{.Tag}}" | head -1)
readonly e2e_test_image=${cluster_name}
readonly kubeconfig_path="/tmp/kind-config-${cluster_name}"
function start_kind(){
# avoid interference with previous test runs
if [[ $(kind-linux-amd64 get clusters | grep "^${cluster_name}*") != "" ]]
then
kind-linux-amd64 delete cluster --name ${cluster_name}
fi
kind-linux-amd64 create cluster --name ${cluster_name} --config ./e2e/kind-cluster-postgres-operator-e2e-tests.yaml
kind-linux-amd64 load docker-image "${operator_image}" --name ${cluster_name}
KUBECONFIG="$(kind-linux-amd64 get kubeconfig-path --name=${cluster_name})"
export KUBECONFIG
}
function set_kind_api_server_ip(){
# use the actual kubeconfig to connect to the 'kind' API server
# but update the IP address of the API server to the one from the Docker 'bridge' network
cp "${KUBECONFIG}" /tmp
  local -r kind_api_server_port=6443 # well-known in the 'kind' codebase
  local -r kind_api_server=$(docker inspect --format "{{ .NetworkSettings.IPAddress }}:${kind_api_server_port}" "${cluster_name}"-control-plane)
sed -i "s/server.*$/server: https:\/\/$kind_api_server/g" "${kubeconfig_path}"
}
function run_tests(){
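  # mount the adjusted kubeconfig into the test container so the k8s client can reach the 'kind' API server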
docker run --rm --mount type=bind,source="$(readlink -f ${kubeconfig_path})",target=/root/.kube/config -e OPERATOR_IMAGE="${operator_image}" "${e2e_test_image}"
}
function clean_up(){
unset KUBECONFIG
kind-linux-amd64 delete cluster --name ${cluster_name}
rm -rf ${kubeconfig_path}
}
function main(){
trap "clean_up" QUIT TERM EXIT
start_kind
set_kind_api_server_ip
run_tests
exit 0
}
main "$@"

e2e/tests/test_e2e.py Normal file

@@ -0,0 +1,327 @@
import unittest
import time
import timeout_decorator
import subprocess
import warnings
import os
import yaml
from kubernetes import client, config
class EndToEndTestCase(unittest.TestCase):
'''
Test interaction of the operator with multiple k8s components.
'''
    # `kind` pods may get stuck in the `Terminating` phase for a few minutes; hence the high test timeout
TEST_TIMEOUT_SEC = 600
@classmethod
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def setUpClass(cls):
'''
Deploy operator to a "kind" cluster created by /e2e/run.sh using examples from /manifests.
This operator deployment is to be shared among all tests.
        /e2e/run.sh deletes the 'kind' cluster after a successful run along with all operator-related entities.
        If a test fails, the cluster is left running to enable manual examination;
        the next invocation of "make e2e-run" will re-create it.
'''
# set a single k8s wrapper for all tests
k8s = cls.k8s = K8s()
        # the operator deploys the pod service account to this namespace on start-up
        # needed for test_multi_namespace_support()
cls.namespace = "test"
v1_namespace = client.V1Namespace(metadata=client.V1ObjectMeta(name=cls.namespace))
k8s.api.core_v1.create_namespace(v1_namespace)
# submit the most recent operator image built on the Docker host
with open("manifests/postgres-operator.yaml", 'r+') as f:
operator_deployment = yaml.safe_load(f)
operator_deployment["spec"]["template"]["spec"]["containers"][0]["image"] = os.environ['OPERATOR_IMAGE']
yaml.dump(operator_deployment, f, Dumper=yaml.Dumper)
for filename in ["operator-service-account-rbac.yaml",
"configmap.yaml",
"postgres-operator.yaml"]:
k8s.create_with_kubectl("manifests/" + filename)
k8s.wait_for_operator_pod_start()
actual_operator_image = k8s.api.core_v1.list_namespaced_pod(
'default', label_selector='name=postgres-operator').items[0].spec.containers[0].image
print("Tested operator image: {}".format(actual_operator_image)) # shows up after tests finish
k8s.create_with_kubectl("manifests/minimal-postgres-manifest.yaml")
k8s.wait_for_pod_start('spilo-role=master')
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_multi_namespace_support(self):
'''
Create a customized Postgres cluster in a non-default namespace.
'''
k8s = self.k8s
with open("manifests/complete-postgres-manifest.yaml", 'r+') as f:
pg_manifest = yaml.safe_load(f)
pg_manifest["metadata"]["namespace"] = self.namespace
yaml.dump(pg_manifest, f, Dumper=yaml.Dumper)
k8s.create_with_kubectl("manifests/complete-postgres-manifest.yaml")
k8s.wait_for_pod_start("spilo-role=master", self.namespace)
self.assert_master_is_unique(self.namespace, version="acid-test-cluster")
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_scaling(self):
"""
Scale up from 2 to 3 and back to 2 pods by updating the Postgres manifest at runtime.
"""
k8s = self.k8s
labels = "version=acid-minimal-cluster"
k8s.wait_for_pg_to_scale(3)
self.assertEqual(3, k8s.count_pods_with_label(labels))
self.assert_master_is_unique()
k8s.wait_for_pg_to_scale(2)
self.assertEqual(2, k8s.count_pods_with_label(labels))
self.assert_master_is_unique()
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_taint_based_eviction(self):
"""
        Add the taint "postgres=:NoExecute" to the node with the master. This must cause a failover.
"""
k8s = self.k8s
cluster_label = 'version=acid-minimal-cluster'
# get nodes of master and replica(s) (expected target of new master)
current_master_node, failover_targets = k8s.get_pg_nodes(cluster_label)
num_replicas = len(failover_targets)
# if all pods live on the same node, failover will happen to other worker(s)
failover_targets = [x for x in failover_targets if x != current_master_node]
if len(failover_targets) == 0:
nodes = k8s.api.core_v1.list_node()
for n in nodes.items:
if "node-role.kubernetes.io/master" not in n.metadata.labels and n.metadata.name != current_master_node:
failover_targets.append(n.metadata.name)
# taint node with postgres=:NoExecute to force failover
body = {
"spec": {
"taints": [
{
"effect": "NoExecute",
"key": "postgres"
}
]
}
}
# patch node and test if master is failing over to one of the expected nodes
k8s.api.core_v1.patch_node(current_master_node, body)
k8s.wait_for_master_failover(failover_targets)
k8s.wait_for_pod_start('spilo-role=replica')
new_master_node, new_replica_nodes = k8s.get_pg_nodes(cluster_label)
self.assertNotEqual(current_master_node, new_master_node,
"Master on {} did not fail over to one of {}".format(current_master_node, failover_targets))
self.assertEqual(num_replicas, len(new_replica_nodes),
"Expected {} replicas, found {}".format(num_replicas, len(new_replica_nodes)))
self.assert_master_is_unique()
# undo the tainting
body = {
"spec": {
"taints": []
}
}
k8s.api.core_v1.patch_node(new_master_node, body)
@timeout_decorator.timeout(TEST_TIMEOUT_SEC)
def test_logical_backup_cron_job(self):
"""
Ensure we can (a) create the cron job at user request for a specific PG cluster
(b) update the cluster-wide image for the logical backup pod
(c) delete the job at user request
Limitations:
(a) Does not run the actual batch job because there is no S3 mock to upload backups to
        (b) Assumes 'acid-minimal-cluster' exists as defined in setUpClass
"""
k8s = self.k8s
# create the cron job
schedule = "7 7 7 7 *"
pg_patch_enable_backup = {
"spec": {
"enableLogicalBackup": True,
"logicalBackupSchedule": schedule
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_enable_backup)
k8s.wait_for_logical_backup_job_creation()
jobs = k8s.get_logical_backup_job().items
self.assertEqual(1, len(jobs), "Expected 1 logical backup job, found {}".format(len(jobs)))
job = jobs[0]
self.assertEqual(job.metadata.name, "logical-backup-acid-minimal-cluster",
"Expected job name {}, found {}"
.format("logical-backup-acid-minimal-cluster", job.metadata.name))
self.assertEqual(job.spec.schedule, schedule,
"Expected {} schedule, found {}"
.format(schedule, job.spec.schedule))
# update the cluster-wide image of the logical backup pod
image = "test-image-name"
config_map_patch = {
"data": {
"logical_backup_docker_image": image,
}
}
k8s.api.core_v1.patch_namespaced_config_map("postgres-operator", "default", config_map_patch)
operator_pod = k8s.api.core_v1.list_namespaced_pod(
'default', label_selector="name=postgres-operator").items[0].metadata.name
k8s.api.core_v1.delete_namespaced_pod(operator_pod, "default") # restart reloads the conf
k8s.wait_for_operator_pod_start()
jobs = k8s.get_logical_backup_job().items
actual_image = jobs[0].spec.job_template.spec.template.spec.containers[0].image
self.assertEqual(actual_image, image,
"Expected job image {}, found {}".format(image, actual_image))
# delete the logical backup cron job
pg_patch_disable_backup = {
"spec": {
"enableLogicalBackup": False,
}
}
k8s.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", "default", "postgresqls", "acid-minimal-cluster", pg_patch_disable_backup)
k8s.wait_for_logical_backup_job_deletion()
jobs = k8s.get_logical_backup_job().items
self.assertEqual(0, len(jobs),
"Expected 0 logical backup jobs, found {}".format(len(jobs)))
def assert_master_is_unique(self, namespace='default', version="acid-minimal-cluster"):
"""
Check that there is a single pod in the k8s cluster with the label "spilo-role=master"
To be called manually after operations that affect pods
"""
k8s = self.k8s
labels = 'spilo-role=master,version=' + version
num_of_master_pods = k8s.count_pods_with_label(labels, namespace)
self.assertEqual(num_of_master_pods, 1, "Expected 1 master pod, found {}".format(num_of_master_pods))
class K8sApi:
def __init__(self):
# https://github.com/kubernetes-client/python/issues/309
warnings.simplefilter("ignore", ResourceWarning)
self.config = config.load_kube_config()
self.k8s_client = client.ApiClient()
self.core_v1 = client.CoreV1Api()
self.apps_v1 = client.AppsV1Api()
self.batch_v1_beta1 = client.BatchV1beta1Api()
self.custom_objects_api = client.CustomObjectsApi()
class K8s:
'''
    Wraps the K8s API client and helper methods.
'''
RETRY_TIMEOUT_SEC = 5
def __init__(self):
self.api = K8sApi()
def get_pg_nodes(self, pg_cluster_name, namespace='default'):
master_pod_node = ''
replica_pod_nodes = []
        pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=pg_cluster_name)
        for pod in pods.items:
if pod.metadata.labels.get('spilo-role') == 'master':
master_pod_node = pod.spec.node_name
elif pod.metadata.labels.get('spilo-role') == 'replica':
replica_pod_nodes.append(pod.spec.node_name)
return master_pod_node, replica_pod_nodes
def wait_for_operator_pod_start(self):
        self.wait_for_pod_start("name=postgres-operator")
        # HACK: the operator must register the CRD and add existing PG clusters after pod start-up;
        # for local execution ~10 seconds suffice
time.sleep(60)
def wait_for_pod_start(self, pod_labels, namespace='default'):
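        # poll until the first pod matching the labels reports the 'Running' phase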
pod_phase = 'No pod running'
while pod_phase != 'Running':
pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=pod_labels).items
if pods:
pod_phase = pods[0].status.phase
time.sleep(self.RETRY_TIMEOUT_SEC)
def wait_for_pg_to_scale(self, number_of_instances, namespace='default'):
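        # patch the desired number of instances and wait until the operator adjusts the pod count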
body = {
"spec": {
"numberOfInstances": number_of_instances
}
}
_ = self.api.custom_objects_api.patch_namespaced_custom_object(
"acid.zalan.do", "v1", namespace, "postgresqls", "acid-minimal-cluster", body)
labels = 'version=acid-minimal-cluster'
while self.count_pods_with_label(labels) != number_of_instances:
time.sleep(self.RETRY_TIMEOUT_SEC)
def count_pods_with_label(self, labels, namespace='default'):
return len(self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items)
def wait_for_master_failover(self, expected_master_nodes, namespace='default'):
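        # poll until a running master pod appears on one of the expected nodes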
pod_phase = 'Failing over'
new_master_node = ''
labels = 'spilo-role=master,version=acid-minimal-cluster'
while (pod_phase != 'Running') or (new_master_node not in expected_master_nodes):
pods = self.api.core_v1.list_namespaced_pod(namespace, label_selector=labels).items
if pods:
new_master_node = pods[0].spec.node_name
pod_phase = pods[0].status.phase
time.sleep(self.RETRY_TIMEOUT_SEC)
def get_logical_backup_job(self, namespace='default'):
return self.api.batch_v1_beta1.list_namespaced_cron_job(namespace, label_selector="application=spilo")
def wait_for_logical_backup_job(self, expected_num_of_jobs):
while (len(self.get_logical_backup_job().items) != expected_num_of_jobs):
time.sleep(self.RETRY_TIMEOUT_SEC)
def wait_for_logical_backup_job_deletion(self):
self.wait_for_logical_backup_job(expected_num_of_jobs=0)
def wait_for_logical_backup_job_creation(self):
self.wait_for_logical_backup_job(expected_num_of_jobs=1)
def create_with_kubectl(self, path):
subprocess.run(["kubectl", "create", "-f", path])
if __name__ == '__main__':
unittest.main()