From 34d0c5d67bd483239ad625265226327b1d7a417c Mon Sep 17 00:00:00 2001
From: Edwin Eefting
Date: Tue, 22 Oct 2019 20:24:43 +0200
Subject: [PATCH] completed progressive thinner class

---
 zfs_autobackup | 340 +++++++++++++++++++++++++++++--------------------
 1 file changed, 199 insertions(+), 141 deletions(-)

diff --git a/zfs_autobackup b/zfs_autobackup
index 6813aab..92ad024 100755
--- a/zfs_autobackup
+++ b/zfs_autobackup
@@ -79,6 +79,174 @@ def abort(txt):
     sys.exit(255)
 
 
+class ThinnerRule:
+    """a thinning schedule rule for Thinner"""
+
+    TIME_NAMES={
+        'y'   : 3600 * 24 * 365.25,
+        'm'   : 3600 * 24 * 30,
+        'w'   : 3600 * 24 * 7,
+        'd'   : 3600 * 24,
+        'h'   : 3600,
+        'min' : 60,
+        's'   : 1,
+    }
+
+    def parse_rule(self, rule_str):
+        """parse a scheduling string
+
+        examples:
+            daily snapshot, remove after a week:     1d1w
+            weekly snapshot, remove after a month:   1w1m
+            monthly snapshot, remove after 6 months: 1m6m
+            yearly snapshot, remove after 2 years:   1y2y
+            keep all snapshots, remove after a day:  1s1d
+            keep nothing:                            1s1s
+        """
+
+        rule_str=rule_str.lower()
+        matches=re.findall("([0-9]*)([a-z]*)([0-9]*)([a-z]*)", rule_str)[0]
+
+        period_amount=int(matches[0])
+        period_unit=matches[1]
+        ttl_amount=int(matches[2])
+        ttl_unit=matches[3]
+
+        if not period_unit in self.TIME_NAMES:
+            raise(Exception("Invalid period string in schedule: '{}'".format(rule_str)))
+
+        if not ttl_unit in self.TIME_NAMES:
+            raise(Exception("Invalid ttl string in schedule: '{}'".format(rule_str)))
+
+        self.period=period_amount * self.TIME_NAMES[period_unit]
+        self.ttl=ttl_amount * self.TIME_NAMES[ttl_unit]
+
+        if self.period>self.ttl:
+            raise(Exception("Period can't be longer than ttl in schedule: '{}'".format(rule_str)))
+
+        self.rule_str=rule_str
+
+    def __str__(self):
+        """get schedule as a schedule string"""
+        return(self.rule_str)
+
+    def __init__(self, rule_str):
+        self.parse_rule(rule_str)
+
+
+class Thinner:
+    """progressive thinner (universal, used for cleaning up snapshots)"""
+
+    def __init__(self, schedule_str, always_keep=1):
+        """schedule_str: comma-separated list of ThinnerRules
+        always_keep: always keep the last X snapshots
+        """
+
+        self.always_keep=always_keep
+        self.rules=[]
+
+        rule_strs=schedule_str.split(",")
+        for rule_str in rule_strs:
+            self.rules.append(ThinnerRule(rule_str))
+
+    def run(self, objects, now=None):
+        """thin list of objects with current schedule rules.
+
+        object should have timestamp-attribute with unix timestamp
+
+        return( keeps, removes )
+        """
+
+        if len(objects)<=self.always_keep:
+            return( (objects, []) )
+
+        time_blocks={}
+        for rule in self.rules:
+            time_blocks[rule.period]={}
+
+        if not now:
+            now=int(time.time())
+
+        keeps=[]
+        removes=[]
+
+        #traverse objects
+        for object in objects[:-self.always_keep]:
+
+            timestamp=object.timestamp
+            age=now-timestamp
+
+            # store in the correct time blocks, per period-size, if not too old yet
+            keep=False
+            for rule in self.rules:
+                if age<=rule.ttl:
+                    block_nr=int(timestamp/rule.period)
+                    if not block_nr in time_blocks[rule.period]:
+                        time_blocks[rule.period][block_nr]=True
+                        keep=True
+
+            if keep:
+                keeps.append(object)
+            else:
+                removes.append(object)
+
+        keeps.extend(objects[-self.always_keep:])
+
+        return( (keeps, removes) )
+
+
+
+######### Thinner testing code
+now=int(time.time())
+
+t=Thinner("1d1w,1w1m,1m6m,1y2y", always_keep=1)
+
+import random
+
+class Thing:
+    def __init__(self, timestamp):
+        self.timestamp=timestamp
+
+    def __str__(self):
+        age=now-self.timestamp
+        struct=time.localtime(self.timestamp)
+        return("{} ({} days old)".format(time.strftime("%Y-%m-%d %H:%M:%S", struct), int(age/(3600*24))))
+
+def test():
+    global now
+    things=[]
+
+    while True:
+        print("#################### {}".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(now))))
+
+        (keeps, removes)=t.run(things, now)
+
+        print("### KEEP")
+        for thing in keeps:
+            print(thing)
+
+        print("### REMOVE")
+        for thing in removes:
+            print(thing)
+
+        things=keeps
+
+        #increase random amount of time and maybe add a thing
+        now=now+random.randint(0,160000)
+        if random.random()>=0:
+            things.append(Thing(now))
+
+        sys.stdin.readline()
+
+test()
+
+
+
 class cached_property(object):
     """ A property that is only computed once per instance and then replaces
         itself with an ordinary attribute. Deleting the attribute resets the
@@ -297,10 +465,12 @@ class ZfsDataset():
         #TODO: nicer?
         self._cached_properties={}
 
+
     def lstrip_path(self,count):
         """return name with first count components stripped"""
         return("/".join(self.name.split("/")[count:]))
 
+
     def rstrip_path(self,count):
         """return name with last count components stripped"""
         return("/".join(self.name.split("/")[:-count]))
@@ -312,12 +482,14 @@ class ZfsDataset():
         (filesystem, snapshot_name)=self.name.split("@")
         return(filesystem)
 
+
     @property
     def snapshot_name(self):
         """snapshot part of the name"""
         (filesystem, snapshot_name)=self.name.split("@")
         return(snapshot_name)
 
+
     @property
     def is_snapshot(self):
         """true if this dataset is a snapshot"""
@@ -336,12 +508,14 @@ class ZfsDataset():
         else:
             return(ZfsDataset(self.zfs_node, self.rstrip_path(1)))
 
+
     @cached_property
     def exists(self):
         """check if dataset exists"""
         self.debug("Checking if filesystem exists")
         return(self.zfs_node.run(tab_split=True, cmd=[ "zfs", "list", self.name], readonly=True, valid_exitcodes=[ 0,1 ], hide_errors=True) and True)
 
+
     def create_filesystem(self, parents=False):
         """create a filesytem"""
         if parents:
@@ -354,11 +528,13 @@ class ZfsDataset():
         #update cache
         self.exists=1
 
+
     def destroy(self):
         self.debug("Destroying")
         self.zfs_node.run(["zfs", "destroy", self.name])
         self.invalidate()
 
+
     @cached_property
     def properties(self):
         """all zfs properties"""
@@ -370,6 +546,7 @@ class ZfsDataset():
         return(dict(self.zfs_node.run(tab_split=True, cmd=cmd, readonly=True, valid_exitcodes=[ 0 ])))
 
+
     def is_changed(self):
         """dataset is changed since ANY latest snapshot ?"""
         self.debug("Checking if dataset is changed")
@@ -379,6 +556,7 @@ class ZfsDataset():
         else:
             return(True)
 
+
     def is_ours(self):
         """return true if this snapshot is created by this backup_nanme"""
         if re.match("^"+self.zfs_node.backup_name+"-[0-9]*$", self.snapshot_name):
@@ -386,6 +564,19 @@ class ZfsDataset():
         else:
             return(False)
 
+
+    @property
+    def timestamp(self):
+        """get timestamp from snapshot name.
+        Only works for our own snapshots with the correct format."""
+        time_str=re.findall("^.*-([0-9]*)$", self.snapshot_name)[0]
+        if len(time_str)!=14:
+            raise(Exception("Snapshot has invalid timestamp in name: {}".format(self.snapshot_name)))
+
+        #new format:
+        time_secs=time.mktime(time.strptime(time_str,"%Y%m%d%H%M%S"))
+        return(time_secs)
+
+
     def from_names(self, names):
         """convert a list of names to a list ZfsDatasets for this zfs_node"""
         ret=[]
@@ -813,152 +1004,19 @@ class ZfsAutobackup:
             raise
 
 
-times=[]
-
-
-time_blocks={
-    'years'   : 3600 * 24 * 365.25,
-    'months'  : 3600 * 24 * 30,
-    'weeks'   : 3600 * 24 * 7,
-    'days'    : 3600 * 24,
-    'hours'   : 3600,
-    'minutes' : 60,
-}
-
-
-
-now=int(time.time())
-
-def thin(schedule, snapshots):
-    if len(snapshots)==0:
-        return(snapshots)
-
-    ret=[]
-
-    time_blocks={}
-
-    for ( period, ttl ) in schedule:
-        time_blocks[period]={}
-
-    # for snapshot in list(reversed(snapshots)):
-    #always keep latest
-    for snapshot in snapshots:
-
-        snapshot_time=snapshot
-
-        keeps=""
-        # just store in the correct time blocks, per period-size
-        for ( period, ttl ) in schedule:
-            block_nr=int(snapshot_time/period)
-            if not block_nr in time_blocks[period]:
-                time_blocks[period][block_nr]=[]
-            time_blocks[period][block_nr].append(snapshot_time)
-
-
-    keep=set()
-
-    #now get the oldest one within the ttl, per block
-    for ( period, ttl ) in schedule:
-        for ( block_nr, snapshots ) in time_blocks[period].items():
-            for snapshot_time in sorted(snapshots):
-                age=now-snapshot_time
-                if age0.5:
-        a.append(now)
-    a=thin(schedule,a)
-    # b.append(now)
-    # b=thin(schedule,a, oldest=False)
-    b=[]
-
-
-    for count in range(0,max(len(a), len(b))):
-        sa=""
-        if count
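
For context, a minimal usage sketch of the new thinning classes. It assumes the ThinnerRule and Thinner classes from the patch above have been pasted into the same file (the zfs_autobackup script has no .py extension, so it is not directly importable); the Snap class, the schedule string and the generated timestamps are made up for illustration and are not part of the patch.

import time

# Assumes ThinnerRule and Thinner from the patch above are defined in this file.
# Snap is a hypothetical stand-in for a snapshot: the thinner only needs objects
# with a numeric 'timestamp' attribute, sorted oldest first.
class Snap:
    def __init__(self, timestamp):
        self.timestamp=timestamp

    def __str__(self):
        return(time.strftime("%Y-%m-%d %H:%M", time.localtime(self.timestamp)))

now=int(time.time())

# one fake snapshot every 6 hours over the last 60 days, oldest first
snapshots=[ Snap(now - n*6*3600) for n in range(60*4, 0, -1) ]

# daily for a week, weekly for a month, monthly for half a year; always keep the newest one
thinner=Thinner("1d1w,1w1m,1m6m", always_keep=1)
(keeps, removes)=thinner.run(snapshots, now)

print("keeping {} of {} snapshots:".format(len(keeps), len(snapshots)))
for snap in keeps:
    print(snap)

The keep/remove split is the same one the patch's own test() loop prints interactively, but without the stdin pauses and the random clock advance.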