Improved logic for the various race conditions in the FreeNAS API
This commit is contained in:
parent
01113c8270
commit
80abc76f66
12
Dockerfile
12
Dockerfile
|
|
@ -59,8 +59,10 @@ ENV PATH=/usr/local/lib/nodejs/bin:$PATH
|
|||
COPY --from=build /usr/local/lib/nodejs /usr/local/lib/nodejs
|
||||
|
||||
# node service requirements
|
||||
# netbase is required by rpcbind/rpcinfo to work properly
|
||||
# /etc/{services,rpc} are required
|
||||
RUN apt-get update && \
|
||||
apt-get install -y e2fsprogs xfsprogs fatresize dosfstools nfs-common cifs-utils sudo && \
|
||||
apt-get install -y netbase e2fsprogs xfsprogs fatresize dosfstools nfs-common cifs-utils sudo && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# controller requirements
|
||||
|
|
@ -75,6 +77,14 @@ RUN chmod +x /usr/local/sbin/iscsiadm
|
|||
ADD docker/multipath /usr/local/sbin
|
||||
RUN chmod +x /usr/local/sbin/multipath
|
||||
|
||||
## USE_HOST_MOUNT_TOOLS=1
|
||||
ADD docker/mount /usr/local/bin/mount
|
||||
RUN chmod +x /usr/local/bin/mount
|
||||
|
||||
## USE_HOST_MOUNT_TOOLS=1
|
||||
ADD docker/umount /usr/local/bin/umount
|
||||
RUN chmod +x /usr/local/bin/umount
|
||||
|
||||
# Run as a non-root user
|
||||
RUN useradd --create-home csi \
|
||||
&& chown -R csi: /home/csi
|
||||
|
|
|
|||
|
|
@ -0,0 +1,19 @@
|
|||
#!/bin/bash

# under certain circumstances high concurrency requests to the FreeNAS/TrueNAS
# API can result in an invalid /etc/ctl.conf written to disk
# this script attempts to mitigate those failures by forcing a rebuild of the
# file using info strictly from the sqlite DB

# can test with this
# logger -t ctld "error in configuration file"

# loop forever: block until ctld logs a configuration error, then
# regenerate /etc/ctl.conf from the middleware DB and reload the service
while true; do
  # tail -n 0 -F follows /var/log/messages from its current end (and survives
  # log rotation); grep -E -m 1 returns as soon as one matching line appears
  grep -E -m 1 "ctld.*error in configuration file" <(tail -n 0 -F /var/log/messages) &>/dev/null

  echo "regen ctld config"
  midclt call etc.generate ctld &>/dev/null

  echo "reload ctld service"
  /etc/rc.d/ctld reload &>/dev/null
done
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
#!/bin/bash

# watch the ctld pid file and ensure the service is actually running

while true; do
  sleep 5

  # read the pid defensively: if /var/run/ctld.pid is missing or empty,
  # an unguarded `ps -p $(cat ...)` would fail with a usage error instead
  # of cleanly signalling that the service is down
  pid="$(cat /var/run/ctld.pid 2>/dev/null)"
  if [ -z "${pid}" ] || ! ps -p "${pid}" | grep ctld &>/dev/null; then
    echo "ctld not running, restarting"

    echo "regen ctld config"
    midclt call etc.generate ctld &>/dev/null

    echo "restart ctld service"
    /etc/rc.d/ctld restart &>/dev/null
  fi
done
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash

# mount wrapper: with USE_HOST_MOUNT_TOOLS=1 the host's mount binary is
# invoked via chroot into /host; otherwise the container's own mount is
# used. env -i gives both paths the same minimal, predictable PATH.

mount_cmd=(/usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" mount)

if [[ ${USE_HOST_MOUNT_TOOLS} -eq 1 ]]; then
  chroot /host "${mount_cmd[@]}" "$@"
else
  "${mount_cmd[@]}" "$@"
fi
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
#!/bin/bash

# umount wrapper: with USE_HOST_MOUNT_TOOLS=1 the host's umount binary is
# invoked via chroot into /host; otherwise the container's own umount is
# used. env -i gives both paths the same minimal, predictable PATH.

umount_cmd=(/usr/bin/env -i PATH="/sbin:/bin:/usr/bin:/usr/sbin" umount)

if [[ ${USE_HOST_MOUNT_TOOLS} -eq 1 ]]; then
  chroot /host "${umount_cmd[@]}" "$@"
else
  "${umount_cmd[@]}" "$@"
fi
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
const { ControllerZfsSshBaseDriver } = require("../controller-zfs-ssh");
|
||||
const { GrpcError, grpc } = require("../../utils/grpc");
|
||||
const HttpClient = require("./http").Client;
|
||||
const sleep = require("../../utils/general").sleep;
|
||||
|
||||
const Handlebars = require("handlebars");
|
||||
|
||||
|
|
@ -170,6 +169,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
const apiVersion = httpClient.getApiVersion();
|
||||
const zb = await this.getZetabyte();
|
||||
|
||||
let volume_context;
|
||||
let properties;
|
||||
let endpoint;
|
||||
let response;
|
||||
|
|
@ -312,29 +312,27 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
);
|
||||
}
|
||||
}
|
||||
|
||||
let volume_context = {
|
||||
node_attach_driver: "nfs",
|
||||
server: this.options.nfs.shareHost,
|
||||
share: properties.mountpoint.value,
|
||||
};
|
||||
return volume_context;
|
||||
|
||||
break;
|
||||
default:
|
||||
throw new GrpcError(
|
||||
grpc.status.FAILED_PRECONDITION,
|
||||
`invalid configuration: unknown apiVersion ${apiVersion}`
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let volume_context = {
|
||||
}
|
||||
|
||||
volume_context = {
|
||||
node_attach_driver: "nfs",
|
||||
server: this.options.nfs.shareHost,
|
||||
share: properties.mountpoint.value,
|
||||
};
|
||||
return volume_context;
|
||||
}
|
||||
|
||||
break;
|
||||
/**
|
||||
* TODO: smb need to be more defensive like iscsi and nfs
|
||||
* ensuring the path is valid and the shareName
|
||||
*/
|
||||
case "smb":
|
||||
properties = await zb.zfs.get(datasetName, [
|
||||
"mountpoint",
|
||||
|
|
@ -460,6 +458,38 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
* v2 = 200
|
||||
*/
|
||||
if ([200, 201].includes(response.statusCode)) {
|
||||
share = response.body;
|
||||
let sharePath;
|
||||
let shareName;
|
||||
switch (apiVersion) {
|
||||
case 1:
|
||||
sharePath = response.body.cifs_path;
|
||||
shareName = response.body.cifs_name;
|
||||
break;
|
||||
case 2:
|
||||
sharePath = response.body.path;
|
||||
shareName = response.body.name;
|
||||
break;
|
||||
}
|
||||
|
||||
if (shareName != smbName) {
|
||||
throw new GrpcError(
|
||||
grpc.status.UNKNOWN,
|
||||
`FreeNAS responded with incorrect share data: ${
|
||||
response.statusCode
|
||||
} body: ${JSON.stringify(response.body)}`
|
||||
);
|
||||
}
|
||||
|
||||
if (sharePath != properties.mountpoint.value) {
|
||||
throw new GrpcError(
|
||||
grpc.status.UNKNOWN,
|
||||
`FreeNAS responded with incorrect share data: ${
|
||||
response.statusCode
|
||||
} body: ${JSON.stringify(response.body)}`
|
||||
);
|
||||
}
|
||||
|
||||
//set zfs property
|
||||
await zb.zfs.set(datasetName, {
|
||||
[FREENAS_SMB_SHARE_PROPERTY_NAME]: response.body.id,
|
||||
|
|
@ -472,11 +502,39 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
if (
|
||||
[409, 422].includes(response.statusCode) &&
|
||||
JSON.stringify(response.body).includes(
|
||||
"You can't share same filesystem with all hosts twice."
|
||||
"A share with this name already exists."
|
||||
)
|
||||
) {
|
||||
// move along
|
||||
// TODO: need to set the shareId here for sure
|
||||
let lookupShare = await this.findResourceByProperties(
|
||||
endpoint,
|
||||
(item) => {
|
||||
if (
|
||||
(item.cifs_path &&
|
||||
item.cifs_path == properties.mountpoint.value &&
|
||||
item.cifs_name &&
|
||||
item.cifs_name == smbName) ||
|
||||
(item.path &&
|
||||
item.path == properties.mountpoint.value &&
|
||||
item.name &&
|
||||
item.name == smbName)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
);
|
||||
|
||||
if (!lookupShare) {
|
||||
throw new GrpcError(
|
||||
grpc.status.UNKNOWN,
|
||||
`FreeNAS failed to find matching share`
|
||||
);
|
||||
}
|
||||
|
||||
//set zfs property
|
||||
await zb.zfs.set(datasetName, {
|
||||
[FREENAS_SMB_SHARE_PROPERTY_NAME]: lookupShare.id,
|
||||
});
|
||||
} else {
|
||||
throw new GrpcError(
|
||||
grpc.status.UNKNOWN,
|
||||
|
|
@ -486,28 +544,22 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
);
|
||||
}
|
||||
}
|
||||
|
||||
let volume_context = {
|
||||
node_attach_driver: "smb",
|
||||
server: this.options.smb.shareHost,
|
||||
share: smbName,
|
||||
};
|
||||
return volume_context;
|
||||
|
||||
break;
|
||||
default:
|
||||
throw new GrpcError(
|
||||
grpc.status.FAILED_PRECONDITION,
|
||||
`invalid configuration: unknown apiVersion ${apiVersion}`
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let volume_context = {
|
||||
}
|
||||
|
||||
volume_context = {
|
||||
node_attach_driver: "smb",
|
||||
server: this.options.smb.shareHost,
|
||||
share: smbName,
|
||||
};
|
||||
return volume_context;
|
||||
}
|
||||
|
||||
break;
|
||||
case "iscsi":
|
||||
properties = await zb.zfs.get(datasetName, [
|
||||
|
|
@ -599,7 +651,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
}
|
||||
|
||||
switch (apiVersion) {
|
||||
case 1: {
|
||||
case 1:
|
||||
response = await httpClient.get(
|
||||
"/services/iscsi/globalconfiguration"
|
||||
);
|
||||
|
|
@ -613,14 +665,46 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
}
|
||||
basename = response.body.iscsi_basename;
|
||||
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
|
||||
break;
|
||||
case 2:
|
||||
response = await httpClient.get("/iscsi/global");
|
||||
if (response.statusCode != 200) {
|
||||
throw new GrpcError(
|
||||
grpc.status.UNKNOWN,
|
||||
`error getting iscsi configuration - code: ${
|
||||
response.statusCode
|
||||
} body: ${JSON.stringify(response.body)}`
|
||||
);
|
||||
}
|
||||
basename = response.body.basename;
|
||||
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
|
||||
break;
|
||||
default:
|
||||
throw new GrpcError(
|
||||
grpc.status.FAILED_PRECONDITION,
|
||||
`invalid configuration: unknown apiVersion ${apiVersion}`
|
||||
);
|
||||
}
|
||||
|
||||
// if we got all the way to the TARGETTOEXTENT then we fully finished
|
||||
// otherwise we must do all assets every time due to the interdependence of IDs etc
|
||||
if (
|
||||
!zb.helpers.isPropertyValueSet(
|
||||
properties[FREENAS_ISCSI_TARGETTOEXTENT_ID_PROPERTY_NAME].value
|
||||
)
|
||||
) {
|
||||
switch (apiVersion) {
|
||||
case 1: {
|
||||
// create target
|
||||
let target = {
|
||||
iscsi_target_name: iscsiName,
|
||||
iscsi_target_alias: "", // TODO: allow template for this
|
||||
};
|
||||
|
||||
response = await httpClient.post("/services/iscsi/target", target);
|
||||
response = await httpClient.post(
|
||||
"/services/iscsi/target",
|
||||
target
|
||||
);
|
||||
|
||||
// 409 if invalid
|
||||
if (response.statusCode != 201) {
|
||||
|
|
@ -675,7 +759,8 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
for (let targetGroupConfig of this.options.iscsi.targetGroups) {
|
||||
let targetGroup = {
|
||||
iscsi_target: target.id,
|
||||
iscsi_target_authgroup: targetGroupConfig.targetGroupAuthGroup,
|
||||
iscsi_target_authgroup:
|
||||
targetGroupConfig.targetGroupAuthGroup,
|
||||
iscsi_target_authtype: targetGroupConfig.targetGroupAuthType
|
||||
? targetGroupConfig.targetGroupAuthType
|
||||
: "None",
|
||||
|
|
@ -766,7 +851,10 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
: Number(extentRpm),
|
||||
iscsi_target_extent_ro: false,
|
||||
};
|
||||
response = await httpClient.post("/services/iscsi/extent", extent);
|
||||
response = await httpClient.post(
|
||||
"/services/iscsi/extent",
|
||||
extent
|
||||
);
|
||||
|
||||
// 409 if invalid
|
||||
if (response.statusCode != 201) {
|
||||
|
|
@ -878,18 +966,6 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
break;
|
||||
}
|
||||
case 2:
|
||||
response = await httpClient.get("/iscsi/global");
|
||||
if (response.statusCode != 200) {
|
||||
throw new GrpcError(
|
||||
grpc.status.UNKNOWN,
|
||||
`error getting iscsi configuration - code: ${
|
||||
response.statusCode
|
||||
} body: ${JSON.stringify(response.body)}`
|
||||
);
|
||||
}
|
||||
basename = response.body.basename;
|
||||
this.ctx.logger.verbose("FreeNAS ISCSI BASENAME: " + basename);
|
||||
|
||||
// create target and targetgroup
|
||||
//let targetId;
|
||||
let targetGroups = [];
|
||||
|
|
@ -927,9 +1003,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
"Target name already exists"
|
||||
)
|
||||
) {
|
||||
target = await this.findResourceByProperties("/iscsi/target", {
|
||||
target = await this.findResourceByProperties(
|
||||
"/iscsi/target",
|
||||
{
|
||||
name: iscsiName,
|
||||
});
|
||||
}
|
||||
);
|
||||
} else {
|
||||
throw new GrpcError(
|
||||
grpc.status.UNKNOWN,
|
||||
|
|
@ -961,9 +1040,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
// TODO: this logic could be more intelligent but this should do for now as it appears in the failure scenario no groups are added
|
||||
// in other words, I have never seen them invalid, only omitted so this should be enough
|
||||
if (target.groups.length != targetGroups.length) {
|
||||
response = await httpClient.put(`/iscsi/target/id/${target.id}`, {
|
||||
response = await httpClient.put(
|
||||
`/iscsi/target/id/${target.id}`,
|
||||
{
|
||||
groups: targetGroups,
|
||||
});
|
||||
}
|
||||
);
|
||||
|
||||
if (response.statusCode != 200) {
|
||||
throw new GrpcError(
|
||||
|
|
@ -1030,9 +1112,12 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
"Extent name must be unique"
|
||||
)
|
||||
) {
|
||||
extent = await this.findResourceByProperties("/iscsi/extent", {
|
||||
extent = await this.findResourceByProperties(
|
||||
"/iscsi/extent",
|
||||
{
|
||||
name: iscsiName,
|
||||
});
|
||||
}
|
||||
);
|
||||
} else {
|
||||
throw new GrpcError(
|
||||
grpc.status.UNKNOWN,
|
||||
|
|
@ -1133,6 +1218,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
`invalid configuration: unknown apiVersion ${apiVersion}`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// iqn = target
|
||||
let iqn = basename + ":" + iscsiName;
|
||||
|
|
@ -1157,7 +1243,7 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
// iqn
|
||||
// lun
|
||||
|
||||
let volume_context = {
|
||||
volume_context = {
|
||||
node_attach_driver: "iscsi",
|
||||
portal: this.options.iscsi.targetPortal,
|
||||
portals: this.options.iscsi.targetPortals.join(","),
|
||||
|
|
@ -1566,103 +1652,6 @@ class FreeNASDriver extends ControllerZfsSshBaseDriver {
|
|||
}
|
||||
}
|
||||
|
||||
async failedAttachHelper(call, err) {
|
||||
const driverShareType = this.getDriverShareType();
|
||||
const sshClient = this.getSshClient();
|
||||
let response;
|
||||
|
||||
// not fully implemented
|
||||
return;
|
||||
|
||||
switch (driverShareType) {
|
||||
case "iscsi":
|
||||
const isScale = await this.getIsScale();
|
||||
const majorMinor = await this.getSystemVersionMajorMinor();
|
||||
|
||||
// only works for BSD-based and 11.3+
|
||||
if (!isScale && majorMinor >= 11.3) {
|
||||
const sudoEnabled = this.getSudoEnabled();
|
||||
const sudoPath = await this.getSudoPath();
|
||||
let command;
|
||||
|
||||
//19 - encountered non-retryable iSCSI login failure
|
||||
// ^ could be missing groups on the target
|
||||
|
||||
//cat /var/run/ctld.pid
|
||||
// ps -p <pid> | grep ctld
|
||||
// ps -p `cat /var/run/ctld.pid` | grep ctld (if 0 exit status it's running, otherwise no)
|
||||
|
||||
// random settle time
|
||||
// this could be getting invoked by other instances of the same controller
|
||||
// or other deployments of controllers in the same of different clusters
|
||||
// altogether
|
||||
let maxSettleTime = 10000;
|
||||
let settleTime = Math.floor(Math.random() * maxSettleTime + 1);
|
||||
await sleep(settleTime);
|
||||
|
||||
// test if config is bad
|
||||
// if so regen
|
||||
command = sshClient.buildCommand("/usr/sbin/ctld", ["-d"]);
|
||||
if (sudoEnabled) {
|
||||
command = sudoPath + " " + command;
|
||||
}
|
||||
|
||||
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
|
||||
|
||||
response = await sshClient.exec(command);
|
||||
let configError = false;
|
||||
let serviceRunning = false;
|
||||
if (response.stderr.includes("configuration error")) {
|
||||
configError = true;
|
||||
}
|
||||
|
||||
// NOTE: this will not be in the output if the config file has an error
|
||||
if (response.stderr.includes("daemon already running")) {
|
||||
serviceRunning = true;
|
||||
}
|
||||
|
||||
if (configError) {
|
||||
this.ctx.logger.warn(
|
||||
"FailedAttachHelper: ctld appears to have a bad configuration file, attempting to regenerate"
|
||||
);
|
||||
// regen config
|
||||
// midclt call etc.generate ctld
|
||||
command = sshClient.buildCommand("midclt", [
|
||||
"call",
|
||||
"etc.generate",
|
||||
"ctld",
|
||||
]);
|
||||
if (sudoEnabled) {
|
||||
command = sudoPath + " " + command;
|
||||
}
|
||||
|
||||
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
|
||||
response = await sshClient.exec(command);
|
||||
|
||||
// reload service (may not be enough)
|
||||
command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
|
||||
if (sudoEnabled) {
|
||||
command = sudoPath + " " + command;
|
||||
}
|
||||
|
||||
this.ctx.logger.verbose("FailedAttachHelper command: %s", command);
|
||||
response = await sshClient.exec(command);
|
||||
|
||||
}
|
||||
|
||||
// note, when the 'bad' state is entered, the status still shows as running
|
||||
// check if service is running
|
||||
// /etc/rc.d/ctld status ...exits 0 if running
|
||||
//command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
|
||||
|
||||
// if service is not running attempt a restart
|
||||
// /etc/rc.d/ctld restart
|
||||
//command = sshClient.buildCommand("/etc/rc.d/ctld", ["reload"]);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
async getApiVersion() {
|
||||
const systemVersion = await this.getSystemVersion();
|
||||
|
||||
|
|
|
|||
|
|
@ -356,15 +356,7 @@ class CsiBaseDriver {
|
|||
nodeDB
|
||||
);
|
||||
// login
|
||||
try {
|
||||
await iscsi.iscsiadm.login(volume_context.iqn, portal);
|
||||
} catch (err) {
|
||||
if (typeof this.failedAttachHelper === "function") {
|
||||
// no need to await this
|
||||
this.failedAttachHelper(call, err);
|
||||
}
|
||||
throw err;
|
||||
}
|
||||
|
||||
// find device name
|
||||
device = `/dev/disk/by-path/ip-${portal}-iscsi-${volume_context.iqn}-lun-${volume_context.lun}`;
|
||||
|
|
@ -386,16 +378,6 @@ class CsiBaseDriver {
|
|||
|
||||
let current_time = Math.round(new Date().getTime() / 1000);
|
||||
if (!result && current_time - timer_start > timer_max) {
|
||||
if (typeof this.failedAttachHelper === "function") {
|
||||
// no need to await this
|
||||
this.failedAttachHelper(
|
||||
call,
|
||||
new Error(
|
||||
`hit timeout waiting for device node to appear: ${device}`
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
driver.ctx.logger.warn(
|
||||
`hit timeout waiting for device node to appear: ${device}`
|
||||
);
|
||||
|
|
|
|||
Loading…
Reference in New Issue