Improved logic for the various race conditions in the FreeNAS API

This commit is contained in:
Travis Glenn Hansen 2020-12-02 18:37:31 -07:00
parent 01113c8270
commit 80abc76f66
7 changed files with 683 additions and 653 deletions

View File

@ -59,8 +59,10 @@ ENV PATH=/usr/local/lib/nodejs/bin:$PATH
COPY --from=build /usr/local/lib/nodejs /usr/local/lib/nodejs COPY --from=build /usr/local/lib/nodejs /usr/local/lib/nodejs
# node service requirements # node service requirements
# netbase is required by rpcbind/rpcinfo to work properly
# /etc/{services,rpc} are required
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y e2fsprogs xfsprogs fatresize dosfstools nfs-common cifs-utils sudo && \ apt-get install -y netbase e2fsprogs xfsprogs fatresize dosfstools nfs-common cifs-utils sudo && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# controller requirements # controller requirements
@ -75,6 +77,14 @@ RUN chmod +x /usr/local/sbin/iscsiadm
ADD docker/multipath /usr/local/sbin ADD docker/multipath /usr/local/sbin
RUN chmod +x /usr/local/sbin/multipath RUN chmod +x /usr/local/sbin/multipath
## USE_HOST_MOUNT_TOOLS=1
ADD docker/mount /usr/local/bin/mount
RUN chmod +x /usr/local/bin/mount
## USE_HOST_MOUNT_TOOLS=1
ADD docker/umount /usr/local/bin/umount
RUN chmod +x /usr/local/bin/umount
# Run as a non-root user # Run as a non-root user
RUN useradd --create-home csi \ RUN useradd --create-home csi \
&& chown -R csi: /home/csi && chown -R csi: /home/csi

View File

@ -0,0 +1,19 @@
#!/bin/bash
# Watchdog for ctld configuration errors.
#
# Under certain circumstances, highly concurrent requests to the FreeNAS/TrueNAS
# API can result in an invalid /etc/ctl.conf being written to disk. This script
# mitigates those failures by forcing a rebuild of the file using info strictly
# from the sqlite DB whenever ctld logs a configuration error.
#
# Can be tested by injecting a matching log line:
#   logger -t ctld "error in configuration file"
while true; do
  # Block until one matching line appears in the log stream.
  # `tail -n 0 -F` follows only newly appended lines (and survives log
  # rotation); `-m 1` makes grep exit after the first match so the loop
  # can react. (`grep -E` replaces the deprecated `egrep`.)
  grep -E -m 1 "ctld.*error in configuration file" <(tail -n 0 -F /var/log/messages) &>/dev/null
  echo "regen ctld config"
  # rebuild /etc/ctl.conf strictly from the middleware sqlite DB
  midclt call etc.generate ctld &>/dev/null
  echo "reload ctld service"
  /etc/rc.d/ctld reload &>/dev/null
done

View File

@ -0,0 +1,16 @@
#!/bin/bash
# Watch the ctld pid file and ensure the service is actually running.
#
# If the pid file is missing/empty, or the recorded pid is no longer a
# running ctld process, regenerate the config and restart the service.
while true; do
  sleep 5
  # read the pid defensively: the file may not exist yet (or at all)
  pid="$(cat /var/run/ctld.pid 2>/dev/null)"
  if [ -z "${pid}" ] || ! ps -p "${pid}" 2>/dev/null | grep ctld &>/dev/null; then
    echo "ctld not running, restarting"
    echo "regen ctld config"
    # rebuild /etc/ctl.conf strictly from the middleware sqlite DB
    midclt call etc.generate ctld &>/dev/null
    echo "restart ctld service"
    /etc/rc.d/ctld restart &>/dev/null
  fi
done

7
docker/mount Normal file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# Wrapper around mount(8) for the CSI node container.
#
# When USE_HOST_MOUNT_TOOLS=1, the mount is executed with the host's tooling
# by chroot-ing into /host (presumably the host root filesystem bind-mounted
# into the container — confirm against the deployment manifests); otherwise
# the container's own mount binary is used. In both cases `env -i` scrubs the
# environment down to a minimal PATH so container env vars cannot leak into
# the mount helpers.
if [[ ${USE_HOST_MOUNT_TOOLS} -eq 1 ]]; then
  chroot /host /usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" mount "$@"
else
  /usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" mount "$@"
fi

7
docker/umount Normal file
View File

@ -0,0 +1,7 @@
#!/bin/bash
# Wrapper around umount(8) for the CSI node container.
#
# When USE_HOST_MOUNT_TOOLS=1, the umount is executed with the host's tooling
# by chroot-ing into /host (presumably the host root filesystem bind-mounted
# into the container — confirm against the deployment manifests); otherwise
# the container's own umount binary is used. In both cases `env -i` scrubs the
# environment down to a minimal PATH so container env vars cannot leak into
# the umount helpers.
if [[ ${USE_HOST_MOUNT_TOOLS} -eq 1 ]]; then
  chroot /host /usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" umount "$@"
else
  /usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" umount "$@"
fi

View File

@ -1,7 +1,6 @@
const { ControllerZfsSshBaseDriver } = require("../controller-zfs-ssh"); const { ControllerZfsSshBaseDriver } = require("../controller-zfs-ssh");
const { GrpcError, grpc } = require("../../utils/grpc"); const { GrpcError, grpc } = require("../../utils/grpc");
const HttpClient = require("./http").Client; const HttpClient = require("./http").Client;
const sleep = require("../../utils/general").sleep;
const Handlebars = require("handlebars"); const Handlebars = require("handlebars");
@ -170,6 +169,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
const apiVersion = httpClient.getApiVersion(); const apiVersion = httpClient.getApiVersion();
const zb = await this.getZetabyte(); const zb = await this.getZetabyte();
let volume_context;
let properties; let properties;
let endpoint; let endpoint;
let response; let response;
@ -312,29 +312,27 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
); );
} }
} }
break;
let volume_context = {
node_attach_driver: "nfs",
server: this.options.nfs.shareHost,
share: properties.mountpoint.value,
};
return volume_context;
default: default:
throw new GrpcError( throw new GrpcError(
grpc.status.FAILED_PRECONDITION, grpc.status.FAILED_PRECONDITION,
`invalid configuration: unknown apiVersion ${apiVersion}` `invalid configuration: unknown apiVersion ${apiVersion}`
); );
} }
} else { }
let volume_context = {
volume_context = {
node_attach_driver: "nfs", node_attach_driver: "nfs",
server: this.options.nfs.shareHost, server: this.options.nfs.shareHost,
share: properties.mountpoint.value, share: properties.mountpoint.value,
}; };
return volume_context; return volume_context;
}
break; break;
/**
* TODO: smb need to be more defensive like iscsi and nfs
* ensuring the path is valid and the shareName
*/
case "smb": case "smb":
properties = await zb.zfs.get(datasetName, [ properties = await zb.zfs.get(datasetName, [
"mountpoint", "mountpoint",
@ -460,6 +458,38 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
* v2 = 200 * v2 = 200
*/ */
if ([200, 201].includes(response.statusCode)) { if ([200, 201].includes(response.statusCode)) {
share = response.body;
let sharePath;
let shareName;
switch (apiVersion) {
case 1:
sharePath = response.body.cifs_path;
shareName = response.body.cifs_name;
break;
case 2:
sharePath = response.body.path;
shareName = response.body.name;
break;
}
if (shareName != smbName) {
throw new GrpcError(
grpc.status.UNKNOWN,
`FreeNAS responded with incorrect share data: ${
response.statusCode
} body: ${JSON.stringify(response.body)}`
);
}
if (sharePath != properties.mountpoint.value) {
throw new GrpcError(
grpc.status.UNKNOWN,
`FreeNAS responded with incorrect share data: ${
response.statusCode
} body: ${JSON.stringify(response.body)}`
);
}
//set zfs property //set zfs property
await zb.zfs.set(datasetName, { await zb.zfs.set(datasetName, {
[FREENAS_SMB_SHARE_PROPERTY_NAME]: response.body.id, [FREENAS_SMB_SHARE_PROPERTY_NAME]: response.body.id,
@ -472,11 +502,39 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
if ( if (
[409, 422].includes(response.statusCode) && [409, 422].includes(response.statusCode) &&
JSON.stringify(response.body).includes( JSON.stringify(response.body).includes(
"You can't share same filesystem with all hosts twice." "A share with this name already exists."
) )
) { ) {
// move along let lookupShare = await this.findResourceByProperties(
// TODO: need to set the shareId here for sure endpoint,
(item) => {
if (
(item.cifs_path &&
item.cifs_path == properties.mountpoint.value &&
item.cifs_name &&
item.cifs_name == smbName) ||
(item.path &&
item.path == properties.mountpoint.value &&
item.name &&
item.name == smbName)
) {
return true;
}
return false;
}
);
if (!lookupShare) {
throw new GrpcError(
grpc.status.UNKNOWN,
`FreeNAS failed to find matching share`
);
}
//set zfs property
await zb.zfs.set(datasetName, {
[FREENAS_SMB_SHARE_PROPERTY_NAME]: lookupShare.id,
});
} else { } else {
throw new GrpcError( throw new GrpcError(
grpc.status.UNKNOWN, grpc.status.UNKNOWN,
@ -486,28 +544,22 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
); );
} }
} }
break;
let volume_context = {
node_attach_driver: "smb",
server: this.options.smb.shareHost,
share: smbName,
};
return volume_context;
default: default:
throw new GrpcError( throw new GrpcError(
grpc.status.FAILED_PRECONDITION, grpc.status.FAILED_PRECONDITION,
`invalid configuration: unknown apiVersion ${apiVersion}` `invalid configuration: unknown apiVersion ${apiVersion}`
); );
} }
} else { }
let volume_context = {
volume_context = {
node_attach_driver: "smb", node_attach_driver: "smb",
server: this.options.smb.shareHost, server: this.options.smb.shareHost,
share: smbName, share: smbName,
}; };
return volume_context; return volume_context;
}
break; break;
case "iscsi": case "iscsi":
properties = await zb.zfs.get(datasetName, [ properties = await zb.zfs.get(datasetName, [
@ -599,7 +651,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
} }
switch (apiVersion) { switch (apiVersion) {
case 1: { case 1:
response = await httpClient.get( response = await httpClient.get(
"/services/iscsi/globalconfiguration" "/services/iscsi/globalconfiguration"
); );
@ -613,14 +665,46 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
} }
basename = response.body.iscsi_basename; basename = response.body.iscsi_basename;
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename); this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
break;
case 2:
response = await httpClient.get("/iscsi/global");
if (response.statusCode != 200) {
throw new GrpcError(
grpc.status.UNKNOWN,
`error getting iscsi configuration - code: ${
response.statusCode
} body: ${JSON.stringify(response.body)}`
);
}
basename = response.body.basename;
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
break;
default:
throw new GrpcError(
grpc.status.FAILED_PRECONDITION,
`invalid configuration: unknown apiVersion ${apiVersion}`
);
}
// if we got all the way to the TARGETTOEXTENT then we fully finished
// otherwise we must do all assets every time due to the interdependence of IDs etc
if (
!zb.helpers.isPropertyValueSet(
properties[FREENAS_ISCSI_TARGETTOEXTENT_ID_PROPERTY_NAME].value
)
) {
switch (apiVersion) {
case 1: {
// create target // create target
let target = { let target = {
iscsi_target_name: iscsiName, iscsi_target_name: iscsiName,
iscsi_target_alias: "", // TODO: allow template for this iscsi_target_alias: "", // TODO: allow template for this
}; };
response = await httpClient.post("/services/iscsi/target", target); response = await httpClient.post(
"/services/iscsi/target",
target
);
// 409 if invalid // 409 if invalid
if (response.statusCode != 201) { if (response.statusCode != 201) {
@ -675,7 +759,8 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
for (let targetGroupConfig of this.options.iscsi.targetGroups) { for (let targetGroupConfig of this.options.iscsi.targetGroups) {
let targetGroup = { let targetGroup = {
iscsi_target: target.id, iscsi_target: target.id,
iscsi_target_authgroup: targetGroupConfig.targetGroupAuthGroup, iscsi_target_authgroup:
targetGroupConfig.targetGroupAuthGroup,
iscsi_target_authtype: targetGroupConfig.targetGroupAuthType iscsi_target_authtype: targetGroupConfig.targetGroupAuthType
? targetGroupConfig.targetGroupAuthType ? targetGroupConfig.targetGroupAuthType
: "None", : "None",
@ -766,7 +851,10 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
: Number(extentRpm), : Number(extentRpm),
iscsi_target_extent_ro: false, iscsi_target_extent_ro: false,
}; };
response = await httpClient.post("/services/iscsi/extent", extent); response = await httpClient.post(
"/services/iscsi/extent",
extent
);
// 409 if invalid // 409 if invalid
if (response.statusCode != 201) { if (response.statusCode != 201) {
@ -878,18 +966,6 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
break; break;
} }
case 2: case 2:
response = await httpClient.get("/iscsi/global");
if (response.statusCode != 200) {
throw new GrpcError(
grpc.status.UNKNOWN,
`error getting iscsi configuration - code: ${
response.statusCode
} body: ${JSON.stringify(response.body)}`
);
}
basename = response.body.basename;
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
// create target and targetgroup // create target and targetgroup
//let targetId; //let targetId;
let targetGroups = []; let targetGroups = [];
@ -927,9 +1003,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
"Target name already exists" "Target name already exists"
) )
) { ) {
target = await this.findResourceByProperties("/iscsi/target", { target = await this.findResourceByProperties(
"/iscsi/target",
{
name: iscsiName, name: iscsiName,
}); }
);
} else { } else {
throw new GrpcError( throw new GrpcError(
grpc.status.UNKNOWN, grpc.status.UNKNOWN,
@ -961,9 +1040,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
// TODO: this logic could be more intelligent but this should do for now as it appears in the failure scenario no groups are added // TODO: this logic could be more intelligent but this should do for now as it appears in the failure scenario no groups are added
// in other words, I have never seen them invalid, only omitted so this should be enough // in other words, I have never seen them invalid, only omitted so this should be enough
if (target.groups.length != targetGroups.length) { if (target.groups.length != targetGroups.length) {
response = await httpClient.put(`/iscsi/target/id/${target.id}`, { response = await httpClient.put(
`/iscsi/target/id/${target.id}`,
{
groups: targetGroups, groups: targetGroups,
}); }
);
if (response.statusCode != 200) { if (response.statusCode != 200) {
throw new GrpcError( throw new GrpcError(
@ -1030,9 +1112,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
"Extent name must be unique" "Extent name must be unique"
) )
) { ) {
extent = await this.findResourceByProperties("/iscsi/extent", { extent = await this.findResourceByProperties(
"/iscsi/extent",
{
name: iscsiName, name: iscsiName,
}); }
);
} else { } else {
throw new GrpcError( throw new GrpcError(
grpc.status.UNKNOWN, grpc.status.UNKNOWN,
@ -1133,6 +1218,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
`invalid configuration: unknown apiVersion ${apiVersion}` `invalid configuration: unknown apiVersion ${apiVersion}`
); );
} }
}
// iqn = target // iqn = target
let iqn = basename + ":" + iscsiName; let iqn = basename + ":" + iscsiName;
@ -1157,7 +1243,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
// iqn // iqn
// lun // lun
let volume_context = { volume_context = {
node_attach_driver: "iscsi", node_attach_driver: "iscsi",
portal: this.options.iscsi.targetPortal, portal: this.options.iscsi.targetPortal,
portals: this.options.iscsi.targetPortals.join(","), portals: this.options.iscsi.targetPortals.join(","),
@ -1566,103 +1652,6 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
} }
} }
/**
 * Best-effort remediation hook invoked after a failed volume attach.
 *
 * Intended (prototype) behavior for iscsi: on BSD-based FreeNAS 11.3+,
 * ssh into the appliance, detect a corrupt ctld configuration via
 * `ctld -d`, and if corrupt regenerate /etc/ctl.conf (`midclt call
 * etc.generate ctld`) and reload the service.
 *
 * NOTE(review): this method is explicitly not fully implemented — it
 * returns unconditionally before the switch below, so everything after
 * the early `return;` is dead code kept as a sketch.
 *
 * @param {object} call the gRPC call that failed (currently unused)
 * @param {Error} err the attach error (currently unused)
 */
async failedAttachHelper(call, err) {
const driverShareType = this.getDriverShareType();
const sshClient = this.getSshClient();
let response;
// not fully implemented
return;
// ---- dead code below: prototype remediation logic ----
switch (driverShareType) {
case "iscsi":
const isScale = await this.getIsScale();
const majorMinor = await this.getSystemVersionMajorMinor();
// only works for BSD-based and 11.3+
if (!isScale && majorMinor >= 11.3) {
const sudoEnabled = this.getSudoEnabled();
const sudoPath = await this.getSudoPath();
let command;
//19 - encountered non-retryable iSCSI login failure
// ^ could be missing groups on the target
//cat /var/run/ctld.pid
// ps -p <pid> | grep ctld
// ps -p `cat /var/run/ctld.pid` | grep ctld (if 0 exit status it's running, otherwise no)
// random settle time
// this could be getting invoked by other instances of the same controller
// or other deployments of controllers in the same of different clusters
// altogether
// jitter 1..maxSettleTime ms to avoid thundering-herd remediation
let maxSettleTime = 10000;
let settleTime = Math.floor(Math.random() * maxSettleTime + 1);
await sleep(settleTime);
// test if config is bad
// if so regen
command = sshClient.buildCommand("/usr/sbin/ctld", ["-d"]);
if (sudoEnabled) {
command = sudoPath + " " + command;
}
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
response = await sshClient.exec(command);
let configError = false;
let serviceRunning = false;
if (response.stderr.includes("configuration error")) {
configError = true;
}
// NOTE: this will not be in the output if the config file has an error
if (response.stderr.includes("daemon already running")) {
serviceRunning = true;
}
if (configError) {
this.ctx.logger.warn(
"FailedAttachHelper: ctld appears to have a bad configuration file, attempting to regenerate"
);
// regen config
// midclt call etc.generate ctld
command = sshClient.buildCommand("midclt", [
"call",
"etc.generate",
"ctld",
]);
if (sudoEnabled) {
command = sudoPath + " " + command;
}
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
response = await sshClient.exec(command);
// reload service (may not be enough)
command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
if (sudoEnabled) {
command = sudoPath + " " + command;
}
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
response = await sshClient.exec(command);
}
// note, when the 'bad' state is entered, the status still shows as running
// check if service is running
// /etc/rc.d/ctld status ...exits 0 if running
//command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
// if service is not running attempt a restart
// /etc/rc.d/ctld restart
//command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
}
break;
}
}
async getApiVersion() { async getApiVersion() {
const systemVersion = await this.getSystemVersion(); const systemVersion = await this.getSystemVersion();

View File

@ -356,15 +356,7 @@ class CsiBaseDriver {
nodeDB nodeDB
); );
// login // login
try {
await iscsi.iscsiadm.login(volume_context.iqn, portal); await iscsi.iscsiadm.login(volume_context.iqn, portal);
} catch (err) {
if (typeof this.failedAttachHelper === "function") {
// no need to await this
this.failedAttachHelper(call, err);
}
throw err;
}
// find device name // find device name
device = `/dev/disk/by-path/ip-${portal}-iscsi-${volume_context.iqn}-lun-${volume_context.lun}`; device = `/dev/disk/by-path/ip-${portal}-iscsi-${volume_context.iqn}-lun-${volume_context.lun}`;
@ -386,16 +378,6 @@ class CsiBaseDriver {
let current_time = Math.round(new Date().getTime() / 1000); let current_time = Math.round(new Date().getTime() / 1000);
if (!result && current_time - timer_start > timer_max) { if (!result && current_time - timer_start > timer_max) {
if (typeof this.failedAttachHelper === "function") {
// no need to await this
this.failedAttachHelper(
call,
new Error(
`hit timeout waiting for device node to appear: ${device}`
)
);
}
driver.ctx.logger.warn( driver.ctx.logger.warn(
`hit timeout waiting for device node to appear: ${device}` `hit timeout waiting for device node to appear: ${device}`
); );