zfs_autobackup 2.4: try to continue on non-fatal errors

This commit is contained in:
Edwin Eefting
2019-10-02 18:21:24 +02:00
parent c176b968a9
commit 54235f455a
2 changed files with 138 additions and 113 deletions

View File

@ -18,7 +18,7 @@ It has the following features:
* Supports resuming of interrupted transfers. (via the zfs extensible_dataset feature)
* Backups and snapshots can be named to prevent conflicts. (multiple backups from and to the same filesystems are no problem)
* Always creates a new snapshot before starting.
* Checks everything and aborts on errors.
* Checks everything but tries to continue on non-fatal errors when possible. (Reports error-count when done)
* Ability to 'finish' aborted backups to see what goes wrong.
* Easy to debug and has a test-mode. Actual unix commands are printed.
* Keeps latest X snapshots remote and locally. (default 30, configurable)
@ -42,7 +42,7 @@ usage: zfs_autobackup [-h] [--ssh-source SSH_SOURCE] [--ssh-target SSH_TARGET]
[--debug]
backup_name target_path
ZFS autobackup v2.3
ZFS autobackup v2.4
positional arguments:
backup_name Name of the backup (you should set the zfs property
@ -108,6 +108,9 @@ optional arguments:
(still does all read-only operations)
--verbose verbose output
--debug debug output (shows commands that are executed)
When a filesystem fails, zfs_backup will continue and report the number of
failures at that end. Also the exit code will indicate the number of failures.
```
Backup example

View File

@ -13,18 +13,20 @@ import time
# Print txt to standard error (sys.stderr) instead of standard output.
def error(txt):
print(txt, file=sys.stderr)
def verbose(txt):
    """Print txt to standard output, but only when args.verbose is set."""
    if not args.verbose:
        return
    print(txt)
def debug(txt):
    """Print txt to standard output, but only when args.debug is set."""
    if not args.debug:
        return
    print(txt)
def abort(txt):
    """Report a fatal error and stop execution.

    Passes txt to error() and then terminates the program with exit
    code 255.
    """
    error(txt)
    # Raising SystemExit directly is what sys.exit() does.
    raise SystemExit(255)
"""run a command. specifiy ssh user@host to run remotely"""
def run(cmd, input=None, ssh_to="local", tab_split=False, valid_exitcodes=[ 0 ], test=False):
@ -473,6 +475,15 @@ def zfs_get_unchanged_filesystems(ssh_to, filesystems):
# Module-level counter of non-fatal failures recorded via failed().
failures = 0


def failed(txt):
    """Record a non-fatal failure so the run can continue with the rest.

    Increments the global ``failures`` counter and reports txt through
    error(), prefixed with "FAILURE: ".
    """
    global failures
    failures += 1
    message = "FAILURE: " + txt + "\n"
    error(message)
def zfs_autobackup():
############## data gathering section
@ -490,8 +501,7 @@ def zfs_autobackup():
#nothing todo
if not source_filesystems:
error("No source filesystems selected, please do a 'zfs set autobackup:{0}=true' on {1}".format(args.backup_name,args.ssh_source))
sys.exit(1)
abort("No source filesystems selected, please do a 'zfs set autobackup:{0}=true' on {1}".format(args.backup_name,args.ssh_source))
if args.ignore_replicated:
replicated_filesystems=zfs_get_unchanged_filesystems(args.ssh_source, source_filesystems)
@ -540,7 +550,6 @@ def zfs_autobackup():
### get existing source snapshots
verbose("Getting source snapshot-list from {0}".format(args.ssh_source))
source_snapshots=zfs_get_snapshots(args.ssh_source, source_filesystems, args.backup_name)
debug("Source snapshots:\n" + str(pprint.pformat(source_snapshots)))
@ -588,6 +597,7 @@ def zfs_autobackup():
#determine which snapshots to send for each filesystem
for source_filesystem in source_filesystems:
try:
target_filesystem=args.target_path + "/" + lstrip_path(source_filesystem, args.strip_path)
if source_filesystem not in source_snapshots:
@ -604,7 +614,7 @@ def zfs_autobackup():
if latest_target_snapshot not in source_snapshots[source_filesystem]:
#cant find latest target anymore. find first common snapshot and inform user
error_msg="Cant find latest target snapshot on source, did you destroy/rename it?"
error_msg="Cant find latest target snapshot on source for '{}', did you destroy/rename it?".format(source_filesystem)
error_msg=error_msg+"\nLatest on target : "+target_filesystem+"@"+latest_target_snapshot
error_msg=error_msg+"\nMissing on source: "+source_filesystem+"@"+latest_target_snapshot
found=False
@ -617,7 +627,7 @@ def zfs_autobackup():
error_msg=error_msg+"\nAlso could not find an earlier common snapshot to rollback to."
else:
if args.ignore_new:
verbose("* Skipping source filesystem {0}, target already has newer snapshots.".format(source_filesystem))
verbose("* Skipping source filesystem '{0}', target already has newer snapshots.".format(source_filesystem))
continue
raise(Exception(error_msg))
@ -695,7 +705,9 @@ def zfs_autobackup():
latest_target_snapshot=send_snapshot
# failed, skip this source_filesystem
except Exception as e:
failed(str(e))
############## cleanup section
@ -730,23 +742,28 @@ def zfs_autobackup():
source_destroys=determine_destroy_list(source_obsolete_snapshots, args.keep_source)
if source_destroys:
verbose("Destroying old snapshots on source {0}:\n{1}".format(args.ssh_source, "\n".join(source_destroys)))
try:
zfs_destroy_snapshots(ssh_to=args.ssh_source, snapshots=source_destroys)
except Exception as e:
failed(str(e))
target_destroys=determine_destroy_list(target_obsolete_snapshots, args.keep_target)
if target_destroys:
verbose("Destroying old snapshots on target {0}:\n{1}".format(args.ssh_target, "\n".join(target_destroys)))
try:
zfs_destroy_snapshots(ssh_to=args.ssh_target, snapshots=target_destroys)
verbose("All done")
except Exception as e:
failed(str(e))
################################################################## ENTRY POINT
# parse arguments
import argparse
parser = argparse.ArgumentParser(description='ZFS autobackup v2.3')
parser = argparse.ArgumentParser(
description='ZFS autobackup v2.4',
epilog='When a filesystem fails, zfs_backup will continue and report the number of failures at that end. Also the exit code will indicate the number of failures.')
parser.add_argument('--ssh-source', default="local", help='Source host to get backup from. (user@hostname) Default %(default)s.')
parser.add_argument('--ssh-target', default="local", help='Target host to push backup to. (user@hostname) Default %(default)s.')
parser.add_argument('--keep-source', type=int, default=30, help='Number of days to keep old snapshots on source. Default %(default)s.')
@ -782,17 +799,22 @@ parser.add_argument('--debug', action='store_true', help='debug output (shows co
args = parser.parse_args()
if args.ignore_replicated and args.allow_empty:
print("Cannot use allow_empty with ignore_replicated.")
sys.exit(1)
abort("Cannot use allow_empty with ignore_replicated.")
try:
zfs_autobackup()
# Report the outcome and turn the failure count into the exit status.
if not failures:
    verbose("All operations completed succesfully.")
    sys.exit(0)
else:
    verbose("{} OPERATION(S) FAILED!".format(failures))
    # Exit with the number of failures, capped at 255.
    # Use the `failures` counter here, not the `failed` function: passing the
    # function to min() raises TypeError on Python 3 (and always yields 255 on
    # Python 2), so the real count never reached the exit status.
    sys.exit(min(255, failures))
except Exception as e:
if args.debug:
raise
else:
print("ABORTED")
print(str(e))
sys.exit(1)
abort("FATAL ERROR")