From 0670424be379bc5a9e5d769147f1378d1238b3c2 Mon Sep 17 00:00:00 2001
From: Ludovic Pouzenc
Date: Sun, 19 Jul 2015 13:09:15 +0200
Subject: find_files: aggregate the sectors found among the reference files.
 Still to do: aggregate the block lists themselves, and test the cases where
 the rest of the data is read first (same disk offset, reverse order).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 mytasks.py | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 104 insertions(+), 14 deletions(-)

diff --git a/mytasks.py b/mytasks.py
index dcb53a8..ab5d63f 100644
--- a/mytasks.py
+++ b/mytasks.py
@@ -24,7 +24,7 @@ import mydisks
 def do_find_files(d,state):
     logging.info("Enter do_find_files()")
     try:
-        state['state'] = 'initializing'
+        state['state'] = 'loading'
         ref_paths = state['filepaths']
         ref_count = len(ref_paths)
         ref_big_hash = {}
@@ -33,18 +33,30 @@ def do_find_files(d,state):
             logging.debug("Try to open ref. file '%s'"%path)
             with open(path, 'rb') as fd:
                 logging.info("Loading ref. file '%s'"%path)
+                warn_empty = True
+                warn_dup = True
                 while True:
                     ref_offset = fd.tell()
                     data = fd.read(512)
                     if not data:
                         break
                     if data == '\0'*512:
-                        logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
+                        if warn_empty:
+                            logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
+                            warn_empty = False
+                        else:
+                            logging.debug("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
                     elif data in ref_big_hash:
                         (prev_ref_no, prev_ref_offset) = ref_big_hash[data]
-                        logging.info("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
-                            (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
+                        if warn_dup:
+                            logging.info("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
+                                (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
+                            warn_dup = False
+                        else:
+                            logging.debug("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
+                                (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
                     else:
+                        # Hash in memory the whole read sector and store its provenance
                         ref_big_hash[data] = (ref_no, ref_offset)

         start = 0
@@ -55,29 +67,107 @@ def do_find_files(d,state):
         state['found'] = []
         state['progress'] = 0
+        state['raw_matches'] = 0
         state['state'] = 'searching'
+        raw_matches = []
         for offset in range(start, end, 512):
             for disk_no in range(d.disk_count):
                 d.disks[disk_no].seek(offset)
                 data = d.disks[disk_no].read(512)
                 if data in ref_big_hash:
-                    f = state['found']
-                    if len(f) < 200:
-                        # TODO agreger les matches
-                        (ref_no, ref_offset) = ref_big_hash[data]
-                        f.append((ref_no,ref_offset,disk_no,offset))
-                        state['found'] = f
-                    else:
-                        state['state'] = 'aborted'
-                        raise Exception('Aborting after too many matches')
+                    (ref_no, ref_offset) = ref_big_hash[data]
+                    a = {
+                        'ref_no': ref_no,
+                        'ref_offset': ref_offset,
+                        'disk_no': disk_no,
+                        'disk_offset': offset
+                    }
+                    raw_matches.append(a)
+                    logging.info('raw_matches.append(%s)'%a)
+                    state['raw_matches'] = len(raw_matches)
             if offset % one_percent == 0:
                 state['progress'] = state['progress'] + 1

+        agg = state['found']
+
+        state['state'] = 'aggregating'
+        found = True
+        while found:
+            found = False
+            i = 0
+            rm_len = len(raw_matches)
+            while not found and i < rm_len:
+                x = raw_matches[i]
+
+                # Try to find an aggregated item that ends just before our ref_offset
+                if not found:
+                    for a in agg:
+                        if a['ref_no'] == x['ref_no'] and x['ref_offset'] == a['ref_offset_end']:
+                            a['ref_offset_end'] = x['ref_offset'] + 512
+                            a['block_list'].append((x['disk_no'], x['disk_offset']))
+                            x['consumed'] = True
+                            found = True
+                            break
+
+                # Try to find an aggregated item that starts just after our ref_offset
+                if not found:
+                    for a in agg:
+                        if a['ref_no'] == x['ref_no'] and x['ref_offset'] + 512 == a['ref_offset_start']:
+                            a['ref_offset_start'] = x['ref_offset']
+                            a['block_list'].insert(0,(x['disk_no'], x['disk_offset']))
+                            x['consumed'] = True
+                            found = True
+                            break
+
+                # Try to find another match right before or after the current one to create an aggregate
+                if not found:
+                    for x2 in raw_matches:
+                        if x2 != x and x2['ref_no'] == x['ref_no'] and abs(x2['ref_offset'] - x['ref_offset']) == 512:
+                            if x2['ref_offset'] > x['ref_offset']:
+                                first = x
+                                last = x2
+                            else:
+                                first = x2
+                                last = x
+
+                            a = {
+                                'ref_no': x['ref_no'],
+                                'ref_offset_start': first['ref_offset'],
+                                'ref_offset_end': last['ref_offset'] + 512,
+                                'block_list': [
+                                    (first['disk_no'], first['disk_offset']),
+                                    (last['disk_no'], last['disk_offset']),
+                                ]
+                            }
+                            logging.info('agg.append(%s)'%a)
+                            agg.append(a)
+                            x['consumed'] = True
+                            x2['consumed'] = True
+                            found = True
+                            break
+                # Orphans are not injected in agg
+                i += 1
+            # end while ( x = raw_matches[i] )
+            # Remove all consumed items from raw_matches before the next while iteration
+            if found:
+                raw_matches = [m for m in raw_matches if 'consumed' not in m]
+                state['raw_matches'] = len(raw_matches)
+
+        # end while not found
+
+        if len(agg) < 200:
+            state['found'] = agg
+            state['state'] = 'searching'
+        else:
+            state['state'] = 'aborted'
+            raise Exception('Aborting after too many matches')

-        ref_big_hash.clear()
         state['state'] = 'finished'
         state['progress'] = 100
     except Exception as e:
         logging.exception(e)
+    finally:
+        ref_big_hash.clear()
+
     logging.info("Exit. do_find_files()")
--
cgit v1.2.3
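
The aggregation pass added above merges raw 512-byte sector matches into contiguous runs per reference file. Below is a minimal standalone sketch of that idea, not the patch's code: it sorts the matches first instead of doing the pairwise merge above, it keeps single-sector orphans as runs of one, and the aggregate_matches() helper name is purely illustrative. The match dict keys are the same ones the patch appends to raw_matches.

    SECTOR = 512

    def aggregate_matches(raw_matches):
        # Merge raw sector matches into contiguous runs per reference file.
        # Each raw match carries 'ref_no', 'ref_offset', 'disk_no', 'disk_offset'
        # like the dicts appended to raw_matches in the patch above.
        agg = []
        for m in sorted(raw_matches, key=lambda m: (m['ref_no'], m['ref_offset'])):
            last = agg[-1] if agg else None
            if (last is not None and last['ref_no'] == m['ref_no']
                    and last['ref_offset_end'] == m['ref_offset']):
                # m continues the current run: extend it by one sector.
                last['ref_offset_end'] = m['ref_offset'] + SECTOR
                last['block_list'].append((m['disk_no'], m['disk_offset']))
            else:
                # Start a new run (unlike the patch, orphans are kept as runs of one).
                agg.append({
                    'ref_no': m['ref_no'],
                    'ref_offset_start': m['ref_offset'],
                    'ref_offset_end': m['ref_offset'] + SECTOR,
                    'block_list': [(m['disk_no'], m['disk_offset'])],
                })
        return agg

    if __name__ == '__main__':
        sample = [
            {'ref_no': 0, 'ref_offset': 1024, 'disk_no': 1, 'disk_offset': 4096},
            {'ref_no': 0, 'ref_offset': 512,  'disk_no': 1, 'disk_offset': 3584},
            {'ref_no': 0, 'ref_offset': 4096, 'disk_no': 0, 'disk_offset': 8192},
        ]
        for run in aggregate_matches(sample):
            print(run)

With the sample input, the first two matches collapse into one run covering reference offsets 512-1536 with an ordered block_list, and the isolated match at offset 4096 becomes its own run.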