1 files changed, 32 insertions, 12 deletions
diff --git a/mytasks.py b/mytasks.py
index 225064b..21ae365 100644
--- a/mytasks.py
+++ b/mytasks.py
@@ -61,17 +61,18 @@ def do_find_files(d,state):
 
         start = 0
         end = min(d.disks_size)
-        one_percent = (end - start) / 100
-        one_percent = one_percent + ( (-one_percent)%512 )
-        logging.debug("start/end/1pc : %i / %i / %i"%(start,end,one_percent))
+        one_per_thousand = (end - start) / 1000
+        one_per_thousand = one_per_thousand + ( (-one_per_thousand)%512 )
+        logging.debug("start/end/1pm : %i / %i / %i"%(start,end,one_per_thousand))
 
         state['found'] = []
-        state['progress'] = 0
+        state['progress'] = 0.0
         state['raw_matches'] = 0
         state['state'] = 'searching'
         raw_matches = []
-        for offset in range(start, end, 512):
+        for offset in xrange(start, end, 512):
             for disk_no in range(d.disk_count):
+		# Read disks sector by sector and try to match sectors present in ref_files (through ref_big_hash)
                 d.disks[disk_no].seek(offset)
                 data = d.disks[disk_no].read(512)
                 if data in ref_big_hash:
@@ -83,13 +84,32 @@ def do_find_files(d,state):
                         'disk_offset': offset
                         }
                     raw_matches.append(a)
-                    logging.info('raw_matches.append(%s)'%a)
+                    logging.info("raw_matches.append(disk%02i@0x%011x <=> '%s'@0x%011x)"%(a['disk_no'], a['disk_offset'], a['ref_no'], a['ref_offset']))
                     state['raw_matches'] = len(raw_matches)
-            if offset % one_percent == 0:
-                state['progress'] = state['progress'] + 1
+            
+            if offset % one_per_thousand == 0:
+                state['progress'] = state['progress'] + 0.1
                 agg = state['found']
 
                 state['state'] = 'aggregating'
+
+		# Blacklist ref_file sectors that are far too common in disk data
+		idx_matches = {}
+		for rm in raw_matches:
+			k = (rm['ref_no'], rm['ref_offset'])
+			if k in idx_matches:
+				idx_matches[k] += 1
+			else:
+				idx_matches[k] = 1
+
+		for k in idx_matches.keys():
+			if idx_matches[k] > 5:
+				logging.info("Skipping too many raw_matches for '%s'@0x%011x"%k)
+				logging.debug("Before filter : %i"%len(raw_matches))
+				raw_matches = filter(lambda rm: k != (rm['ref_no'], rm['ref_offset']), raw_matches)
+				logging.debug("After filter : %i"%len(raw_matches))
+
+		# Aggregate raw_matches
                 found = True
                 while found:
                     found = False
@@ -163,15 +183,15 @@ def do_find_files(d,state):
 
                 # end while not found
 
-                if len(agg) < 200:
-                    state['found'] = agg
+                if len(agg) < 10000:
+                    state['found'] = filter(lambda x: len(x['block_list']) > 1, agg)
                     state['state'] = 'searching'
                 else:
                     state['state'] = 'aborted'
                     raise Exception('Aborting after too many matches')
 
         state['state'] = 'finished'
-        state['progress'] = 100
+        state['progress'] = 100.0
     except Exception as e:
         logging.exception(e)
     finally:
@@ -195,7 +215,7 @@ def do_find_bootsect(d,state):
         state['found'] = []
         state['progress'] = 0
         state['state'] = 'searching'
-        for offset in range(start, end, 512):
+        for offset in xrange(start, end, 512):
             for disk_no in range(d.disk_count):
                 d.disks[disk_no].seek(offset)
                 data = d.disks[disk_no].read(512)