improved logic for the various race conditions from the FreeNAS API

Travis Glenn Hansen 2020-12-02 18:37:31 -07:00
parent 01113c8270
commit 80abc76f66
7 changed files with 683 additions and 653 deletions

Dockerfile

@ -59,8 +59,10 @@ ENV PATH=/usr/local/lib/nodejs/bin:$PATH
COPY --from=build /usr/local/lib/nodejs /usr/local/lib/nodejs
# node service requirements
# netbase is required for rpcbind/rpcinfo to work properly
# /etc/{services,rpc} are required
RUN apt-get update && \
apt-get install -y e2fsprogs xfsprogs fatresize dosfstools nfs-common cifs-utils sudo && \
apt-get install -y netbase e2fsprogs xfsprogs fatresize dosfstools nfs-common cifs-utils sudo && \
rm -rf /var/lib/apt/lists/*
# controller requirements
@ -75,6 +77,14 @@ RUN chmod +x /usr/local/sbin/iscsiadm
ADD docker/multipath /usr/local/sbin
RUN chmod +x /usr/local/sbin/multipath
## USE_HOST_MOUNT_TOOLS=1
ADD docker/mount /usr/local/bin/mount
RUN chmod +x /usr/local/bin/mount
## USE_HOST_MOUNT_TOOLS=1
ADD docker/umount /usr/local/bin/umount
RUN chmod +x /usr/local/bin/umount
# Run as a non-root user
RUN useradd --create-home csi \
&& chown -R csi: /home/csi


@ -0,0 +1,19 @@
#!/bin/bash
# under certain circumstances, highly concurrent requests to the FreeNAS/TrueNAS
# API can result in an invalid /etc/ctl.conf being written to disk
# this script attempts to mitigate those failures by forcing a rebuild of the
# file using info strictly from the sqlite DB
#
# can be tested with:
#   logger -t ctld "error in configuration file"
while true; do
  egrep -m 1 "ctld.*error in configuration file" <(tail -n 0 -F /var/log/messages) &>/dev/null
  echo "regen ctld config"
  midclt call etc.generate ctld &>/dev/null
  echo "reload ctld service"
  /etc/rc.d/ctld reload &>/dev/null
done
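The loop blocks on the egrep until a matching syslog line appears, then regenerates /etc/ctl.conf from the middleware and reloads ctld. A quick manual check might look like the following (only a sketch; the script filename is hypothetical):

  # shell 1: run the watchdog on the FreeNAS host
  ./ctld-config-watchdog.sh
  # shell 2: inject a syslog line matching the trigger pattern
  logger -t ctld "error in configuration file"
  # shell 1 should now print "regen ctld config" and "reload ctld service"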


@ -0,0 +1,16 @@
#!/bin/bash
# watch the ctld pid file and ensure the service is actually running
while true; do
  sleep 5
  ps -p "$(cat /var/run/ctld.pid)" | grep ctld &>/dev/null || {
    echo "ctld not running, restarting"
    echo "regen ctld config"
    midclt call etc.generate ctld &>/dev/null
    echo "restart ctld service"
    /etc/rc.d/ctld restart &>/dev/null
  }
done
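Neither script daemonizes or survives a reboot on its own. One way to keep both running (an assumption for illustration, not part of this commit; paths and names are hypothetical) is a pair of @reboot entries in the root crontab on the FreeNAS host:

  # hypothetical root crontab entries
  @reboot /root/ctld-config-watchdog.sh > /dev/null 2>&1 &
  @reboot /root/ctld-pid-watchdog.sh > /dev/null 2>&1 &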

docker/mount Normal file

@ -0,0 +1,7 @@
#!/bin/bash
# with USE_HOST_MOUNT_TOOLS=1, run the host's own mount via chroot; otherwise use the in-container binary
if [[ ${USE_HOST_MOUNT_TOOLS} -eq 1 ]]; then
  chroot /host /usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" mount "$@"
else
  /usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" mount "$@"
fi

docker/umount Normal file

@ -0,0 +1,7 @@
#!/bin/bash
# with USE_HOST_MOUNT_TOOLS=1, run the host's own umount via chroot; otherwise use the in-container binary
if [[ ${USE_HOST_MOUNT_TOOLS} -eq 1 ]]; then
  chroot /host /usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" umount "$@"
else
  /usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" umount "$@"
fi
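These wrappers shadow the real mount/umount on the container's PATH. With USE_HOST_MOUNT_TOOLS=1 they chroot into /host (assumed here to be the host root filesystem bind-mounted into the container) so the host's own mount tooling is used; any other value falls through to the in-container binaries with the same sanitized PATH. A minimal sketch of both paths, with example server and share names:

  # inside the node container; addresses and paths are illustrative
  USE_HOST_MOUNT_TOOLS=1 mount -t nfs 192.168.1.10:/tank/pvc-test /mnt/test   # host tools via chroot /host
  USE_HOST_MOUNT_TOOLS=0 mount -t nfs 192.168.1.10:/tank/pvc-test /mnt/test   # in-container tools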

src/driver/freenas/index.js

@ -1,7 +1,6 @@
const { ControllerZfsSshBaseDriver } = require("../controller-zfs-ssh");
const { GrpcError, grpc } = require("../../utils/grpc");
const HttpClient = require("./http").Client;
const sleep = require("../../utils/general").sleep;
const Handlebars = require("handlebars");
@ -170,6 +169,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
const apiVersion = httpClient.getApiVersion();
const zb = await this.getZetabyte();
let volume_context;
let properties;
let endpoint;
let response;
@ -312,29 +312,27 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
);
}
}
let volume_context = {
node_attach_driver: "nfs",
server: this.options.nfs.shareHost,
share: properties.mountpoint.value,
};
return volume_context;
break;
default:
throw new GrpcError(
grpc.status.FAILED_PRECONDITION,
`invalid configuration: unknown apiVersion ${apiVersion}`
);
}
} else {
let volume_context = {
}
volume_context = {
node_attach_driver: "nfs",
server: this.options.nfs.shareHost,
share: properties.mountpoint.value,
};
return volume_context;
}
break;
/**
 * TODO: smb needs to be more defensive like iscsi and nfs,
 * ensuring the path and the shareName are valid
 */
case "smb":
properties = await zb.zfs.get(datasetName, [
"mountpoint",
@ -460,6 +458,38 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
* v2 = 200
*/
if ([200, 201].includes(response.statusCode)) {
share = response.body;
let sharePath;
let shareName;
switch (apiVersion) {
case 1:
sharePath = response.body.cifs_path;
shareName = response.body.cifs_name;
break;
case 2:
sharePath = response.body.path;
shareName = response.body.name;
break;
}
if (shareName != smbName) {
throw new GrpcError(
grpc.status.UNKNOWN,
`FreeNAS responded with incorrect share data: ${
response.statusCode
} body: ${JSON.stringify(response.body)}`
);
}
if (sharePath != properties.mountpoint.value) {
throw new GrpcError(
grpc.status.UNKNOWN,
`FreeNAS responded with incorrect share data: ${
response.statusCode
} body: ${JSON.stringify(response.body)}`
);
}
//set zfs property
await zb.zfs.set(datasetName, {
[FREENAS_SMB_SHARE_PROPERTY_NAME]: response.body.id,
@ -472,11 +502,39 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
if (
[409, 422].includes(response.statusCode) &&
JSON.stringify(response.body).includes(
"You can't share same filesystem with all hosts twice."
"A share with this name already exists."
)
) {
// move along
// TODO: need to set the shareId here for sure
let lookupShare = await this.findResourceByProperties(
endpoint,
(item) => {
if (
(item.cifs_path &&
item.cifs_path == properties.mountpoint.value &&
item.cifs_name &&
item.cifs_name == smbName) ||
(item.path &&
item.path == properties.mountpoint.value &&
item.name &&
item.name == smbName)
) {
return true;
}
return false;
}
);
if (!lookupShare) {
throw new GrpcError(
grpc.status.UNKNOWN,
`FreeNAS failed to find matching share`
);
}
//set zfs property
await zb.zfs.set(datasetName, {
[FREENAS_SMB_SHARE_PROPERTY_NAME]: lookupShare.id,
});
} else {
throw new GrpcError(
grpc.status.UNKNOWN,
@ -486,28 +544,22 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
);
}
}
let volume_context = {
node_attach_driver: "smb",
server: this.options.smb.shareHost,
share: smbName,
};
return volume_context;
break;
default:
throw new GrpcError(
grpc.status.FAILED_PRECONDITION,
`invalid configuration: unknown apiVersion ${apiVersion}`
);
}
} else {
let volume_context = {
}
volume_context = {
node_attach_driver: "smb",
server: this.options.smb.shareHost,
share: smbName,
};
return volume_context;
}
break;
case "iscsi":
properties = await zb.zfs.get(datasetName, [
@ -599,7 +651,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
}
switch (apiVersion) {
case 1: {
case 1:
response = await httpClient.get(
"/services/iscsi/globalconfiguration"
);
@ -613,14 +665,46 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
}
basename = response.body.iscsi_basename;
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
break;
case 2:
response = await httpClient.get("/iscsi/global");
if (response.statusCode != 200) {
throw new GrpcError(
grpc.status.UNKNOWN,
`error getting iscsi configuration - code: ${
response.statusCode
} body: ${JSON.stringify(response.body)}`
);
}
basename = response.body.basename;
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
break;
default:
throw new GrpcError(
grpc.status.FAILED_PRECONDITION,
`invalid configuration: unknown apiVersion ${apiVersion}`
);
}
// if we previously got all the way to the TARGETTOEXTENT step then we fully finished
// otherwise we must (re)create all assets every time due to the interdependence of IDs etc
if (
!zb.helpers.isPropertyValueSet(
properties[FREENAS_ISCSI_TARGETTOEXTENT_ID_PROPERTY_NAME].value
)
) {
switch (apiVersion) {
case 1: {
// create target
let target = {
iscsi_target_name: iscsiName,
iscsi_target_alias: "", // TODO: allow template for this
};
response = await httpClient.post("/services/iscsi/target", target);
response = await httpClient.post(
"/services/iscsi/target",
target
);
// 409 if invalid
if (response.statusCode != 201) {
@ -675,7 +759,8 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
for (let targetGroupConfig of this.options.iscsi.targetGroups) {
let targetGroup = {
iscsi_target: target.id,
iscsi_target_authgroup: targetGroupConfig.targetGroupAuthGroup,
iscsi_target_authgroup:
targetGroupConfig.targetGroupAuthGroup,
iscsi_target_authtype: targetGroupConfig.targetGroupAuthType
? targetGroupConfig.targetGroupAuthType
: "None",
@ -766,7 +851,10 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
: Number(extentRpm),
iscsi_target_extent_ro: false,
};
response = await httpClient.post("/services/iscsi/extent", extent);
response = await httpClient.post(
"/services/iscsi/extent",
extent
);
// 409 if invalid
if (response.statusCode != 201) {
@ -878,18 +966,6 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
break;
}
case 2:
response = await httpClient.get("/iscsi/global");
if (response.statusCode != 200) {
throw new GrpcError(
grpc.status.UNKNOWN,
`error getting iscsi configuration - code: ${
response.statusCode
} body: ${JSON.stringify(response.body)}`
);
}
basename = response.body.basename;
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
// create target and targetgroup
//let targetId;
let targetGroups = [];
@ -927,9 +1003,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
"Target name already exists"
)
) {
target = await this.findResourceByProperties("/iscsi/target", {
target = await this.findResourceByProperties(
"/iscsi/target",
{
name: iscsiName,
});
}
);
} else {
throw new GrpcError(
grpc.status.UNKNOWN,
@ -961,9 +1040,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
// TODO: this logic could be more intelligent, but it should do for now since in the failure scenario no groups are added at all
// in other words, I have never seen the groups invalid, only omitted, so this should be enough
if (target.groups.length != targetGroups.length) {
response = await httpClient.put(`/iscsi/target/id/${target.id}`, {
response = await httpClient.put(
`/iscsi/target/id/${target.id}`,
{
groups: targetGroups,
});
}
);
if (response.statusCode != 200) {
throw new GrpcError(
@ -1030,9 +1112,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
"Extent name must be unique"
)
) {
extent = await this.findResourceByProperties("/iscsi/extent", {
extent = await this.findResourceByProperties(
"/iscsi/extent",
{
name: iscsiName,
});
}
);
} else {
throw new GrpcError(
grpc.status.UNKNOWN,
@ -1133,6 +1218,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
`invalid configuration: unknown apiVersion ${apiVersion}`
);
}
}
// iqn = target
let iqn = basename + ":" + iscsiName;
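// e.g. with the stock FreeNAS basename "iqn.2005-10.org.freenas.ctl" and an
// asset named "pvc-123", the resulting iqn would be
// "iqn.2005-10.org.freenas.ctl:pvc-123" (illustrative values only)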
@ -1157,7 +1243,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
// iqn
// lun
let volume_context = {
volume_context = {
node_attach_driver: "iscsi",
portal: this.options.iscsi.targetPortal,
portals: this.options.iscsi.targetPortals.join(","),
@ -1566,103 +1652,6 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
}
}
async failedAttachHelper(call, err) {
const driverShareType = this.getDriverShareType();
const sshClient = this.getSshClient();
let response;
// not fully implemented
return;
switch (driverShareType) {
case "iscsi":
const isScale = await this.getIsScale();
const majorMinor = await this.getSystemVersionMajorMinor();
// only works for BSD-based and 11.3+
if (!isScale && majorMinor >= 11.3) {
const sudoEnabled = this.getSudoEnabled();
const sudoPath = await this.getSudoPath();
let command;
//19 - encountered non-retryable iSCSI login failure
// ^ could be missing groups on the target
// ps -p `cat /var/run/ctld.pid` | grep ctld (exit status 0 means it is running, otherwise it is not)
// random settle time
// this could be getting invoked by other instances of the same controller,
// or by other controller deployments in the same or different clusters
// altogether
let maxSettleTime = 10000;
let settleTime = Math.floor(Math.random() * maxSettleTime + 1);
await sleep(settleTime);
// test if config is bad
// if so regen
command = sshClient.buildCommand("/usr/sbin/ctld", ["-d"]);
if (sudoEnabled) {
command = sudoPath + " " + command;
}
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
response = await sshClient.exec(command);
let configError = false;
let serviceRunning = false;
if (response.stderr.includes("configuration error")) {
configError = true;
}
// NOTE: this will not be in the output if the config file has an error
if (response.stderr.includes("daemon already running")) {
serviceRunning = true;
}
if (configError) {
this.ctx.logger.warn(
"FailedAttachHelper: ctld appears to have a bad configuration file, attempting to regenerate"
);
// regen config
// midclt call etc.generate ctld
command = sshClient.buildCommand("midclt", [
"call",
"etc.generate",
"ctld",
]);
if (sudoEnabled) {
command = sudoPath + " " + command;
}
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
response = await sshClient.exec(command);
// reload service (may not be enough)
command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
if (sudoEnabled) {
command = sudoPath + " " + command;
}
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
response = await sshClient.exec(command);
}
// note, when the 'bad' state is entered, the status still shows as running
// check if service is running
// /etc/rc.d/ctld status ...exits 0 if running
//command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
// if service is not running attempt a restart
// /etc/rc.d/ctld restart
//command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
}
break;
}
}
async getApiVersion() {
const systemVersion = await this.getSystemVersion();

src/driver/index.js

@ -356,15 +356,7 @@ class CsiBaseDriver {
nodeDB
);
// login
try {
await iscsi.iscsiadm.login(volume_context.iqn, portal);
} catch (err) {
if (typeof this.failedAttachHelper === "function") {
// no need to await this
this.failedAttachHelper(call, err);
}
throw err;
}
// find device name
device = `/dev/disk/by-path/ip-${portal}-iscsi-${volume_context.iqn}-lun-${volume_context.lun}`;
@ -386,16 +378,6 @@ class CsiBaseDriver {
let current_time = Math.round(new Date().getTime() / 1000);
if (!result && current_time - timer_start > timer_max) {
if (typeof this.failedAttachHelper === "function") {
// no need to await this
this.failedAttachHelper(
call,
new Error(
`hit timeout waiting for device node to appear: ${device}`
)
);
}
driver.ctx.logger.warn(
`hit timeout waiting for device node to appear: ${device}`
);