From 2a5966a6517e3e99062c5796200389fb591091f6 Mon Sep 17 00:00:00 2001 From: Ludovic Pouzenc Date: Fri, 21 Aug 2015 17:54:11 +0200 Subject: Adding RAID 5+0 support, memory fixes and improvements for big disks --- mytasks.py | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) (limited to 'mytasks.py') diff --git a/mytasks.py b/mytasks.py index 225064b..21ae365 100644 --- a/mytasks.py +++ b/mytasks.py @@ -61,17 +61,18 @@ def do_find_files(d,state): start = 0 end = min(d.disks_size) - one_percent = (end - start) / 100 - one_percent = one_percent + ( (-one_percent)%512 ) - logging.debug("start/end/1pc : %i / %i / %i"%(start,end,one_percent)) + one_per_thousand = (end - start) / 1000 + one_per_thousand = one_per_thousand + ( (-one_per_thousand)%512 ) + logging.debug("start/end/1pm : %i / %i / %i"%(start,end,one_per_thousand)) state['found'] = [] - state['progress'] = 0 + state['progress'] = 0.0 state['raw_matches'] = 0 state['state'] = 'searching' raw_matches = [] - for offset in range(start, end, 512): + for offset in xrange(start, end, 512): for disk_no in range(d.disk_count): + # Read disks sector by sector and try to match sectors present in ref_files (through ref_big_hash) d.disks[disk_no].seek(offset) data = d.disks[disk_no].read(512) if data in ref_big_hash: @@ -83,13 +84,32 @@ def do_find_files(d,state): 'disk_offset': offset } raw_matches.append(a) - logging.info('raw_matches.append(%s)'%a) + logging.info("raw_matches.append(disk%02i@0x%011x <=> '%s'@0x%011x)"%(a['disk_no'], a['disk_offset'], a['ref_no'], a['ref_offset'])) state['raw_matches'] = len(raw_matches) - if offset % one_percent == 0: - state['progress'] = state['progress'] + 1 + + if offset % one_per_thousand == 0: + state['progress'] = state['progress'] + 0.1 agg = state['found'] state['state'] = 'aggregating' + + # Blacklist ref_file sectors that are far too common in disk data + idx_matches = {} + for rm in raw_matches: + k = (rm['ref_no'], 
rm['ref_offset']) + if k in idx_matches: + idx_matches[k] += 1 + else: + idx_matches[k] = 1 + + for k in idx_matches.keys(): + if idx_matches[k] > 5: + logging.info("Skipping too many raw_matches for '%s'@0x%011x"%k) + logging.debug("Before filter : %i"%len(raw_matches)) + raw_matches = filter(lambda rm: k != (rm['ref_no'], rm['ref_offset']), raw_matches) + logging.debug("After filter : %i"%len(raw_matches)) + + # Aggregate raw_matches found = True while found: found = False @@ -163,15 +183,15 @@ def do_find_files(d,state): # end while not found - if len(agg) < 200: - state['found'] = agg + if len(agg) < 10000: + state['found'] = filter(lambda x: len(x['block_list']) > 1, agg) state['state'] = 'searching' else: state['state'] = 'aborted' raise Exception('Aborting after too many matches') state['state'] = 'finished' - state['progress'] = 100 + state['progress'] = 100.0 except Exception as e: logging.exception(e) finally: @@ -195,7 +215,7 @@ def do_find_bootsect(d,state): state['found'] = [] state['progress'] = 0 state['state'] = 'searching' - for offset in range(start, end, 512): + for offset in xrange(start, end, 512): for disk_no in range(d.disk_count): d.disks[disk_no].seek(offset) data = d.disks[disk_no].read(512) -- cgit v1.2.3