summaryrefslogtreecommitdiff
path: root/mytasks.py
diff options
context:
space:
mode:
Diffstat (limited to 'mytasks.py')
-rw-r--r--mytasks.py44
1 files changed, 32 insertions, 12 deletions
diff --git a/mytasks.py b/mytasks.py
index 225064b..21ae365 100644
--- a/mytasks.py
+++ b/mytasks.py
@@ -61,17 +61,18 @@ def do_find_files(d,state):
start = 0
end = min(d.disks_size)
- one_percent = (end - start) / 100
- one_percent = one_percent + ( (-one_percent)%512 )
- logging.debug("start/end/1pc : %i / %i / %i"%(start,end,one_percent))
+ one_per_thousand = (end - start) / 1000
+ one_per_thousand = one_per_thousand + ( (-one_per_thousand)%512 )
+ logging.debug("start/end/1pm : %i / %i / %i"%(start,end,one_per_thousand))
state['found'] = []
- state['progress'] = 0
+ state['progress'] = 0.0
state['raw_matches'] = 0
state['state'] = 'searching'
raw_matches = []
- for offset in range(start, end, 512):
+ for offset in xrange(start, end, 512):
for disk_no in range(d.disk_count):
+ # Read disks sector by sector and try to match sectors present in ref_files (through ref_big_hash)
d.disks[disk_no].seek(offset)
data = d.disks[disk_no].read(512)
if data in ref_big_hash:
@@ -83,13 +84,32 @@ def do_find_files(d,state):
'disk_offset': offset
}
raw_matches.append(a)
- logging.info('raw_matches.append(%s)'%a)
+ logging.info("raw_matches.append(disk%02i@0x%011x <=> '%s'@0x%011x)"%(a['disk_no'], a['disk_offset'], a['ref_no'], a['ref_offset']))
state['raw_matches'] = len(raw_matches)
- if offset % one_percent == 0:
- state['progress'] = state['progress'] + 1
+
+ if offset % one_per_thousand == 0:
+ state['progress'] = state['progress'] + 0.1
agg = state['found']
state['state'] = 'aggregating'
+
+ # Blacklist ref_file sectors that are far too common in disk data
+ idx_matches = {}
+ for rm in raw_matches:
+ k = (rm['ref_no'], rm['ref_offset'])
+ if k in idx_matches:
+ idx_matches[k] += 1
+ else:
+ idx_matches[k] = 1
+
+ for k in idx_matches.keys():
+ if idx_matches[k] > 5:
+ logging.info("Skipping too many raw_matches for '%s'@0x%011x"%k)
+ logging.debug("Before filter : %i"%len(raw_matches))
+ raw_matches = filter(lambda rm: k != (rm['ref_no'], rm['ref_offset']), raw_matches)
+ logging.debug("After filter : %i"%len(raw_matches))
+
+ # Aggregate raw_matches
found = True
while found:
found = False
@@ -163,15 +183,15 @@ def do_find_files(d,state):
# end while not found
- if len(agg) < 200:
- state['found'] = agg
+ if len(agg) < 10000:
+ state['found'] = filter(lambda x: len(x['block_list']) > 1, agg)
state['state'] = 'searching'
else:
state['state'] = 'aborted'
raise Exception('Aborting after too many matches')
state['state'] = 'finished'
- state['progress'] = 100
+ state['progress'] = 100.0
except Exception as e:
logging.exception(e)
finally:
@@ -195,7 +215,7 @@ def do_find_bootsect(d,state):
state['found'] = []
state['progress'] = 0
state['state'] = 'searching'
- for offset in range(start, end, 512):
+ for offset in xrange(start, end, 512):
for disk_no in range(d.disk_count):
d.disks[disk_no].seek(offset)
data = d.disks[disk_no].read(512)