From 0670424be379bc5a9e5d769147f1378d1238b3c2 Mon Sep 17 00:00:00 2001
From: Ludovic Pouzenc
Date: Sun, 19 Jul 2015 13:09:15 +0200
Subject: find_files: aggregate the sectors found among the reference files.
 Still to do: aggregate the block lists themselves, and test the cases where
 the rest of the data is read first (same disk offset, reverse order).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 mytasks.py | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 104 insertions(+), 14 deletions(-)

diff --git a/mytasks.py b/mytasks.py
index dcb53a8..ab5d63f 100644
--- a/mytasks.py
+++ b/mytasks.py
@@ -24,7 +24,7 @@ import mydisks
 def do_find_files(d,state):
     logging.info("Enter do_find_files()")
     try:
-        state['state'] = 'initializing'
+        state['state'] = 'loading'
         ref_paths = state['filepaths']
         ref_count = len(ref_paths)
         ref_big_hash = {}
@@ -33,18 +33,30 @@ def do_find_files(d,state):
             logging.debug("Try to open ref. file '%s'"%path)
             with open(path, 'rb') as fd:
                 logging.info("Loading ref. file '%s'"%path)
+                warn_empty = True
+                warn_dup = True
                 while True:
                     ref_offset = fd.tell()
                     data = fd.read(512)
                     if not data:
                         break
                     if data == '\0'*512:
-                        logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
+                        if warn_empty:
+                            logging.info("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
+                            warn_empty = False
+                        else:
+                            logging.debug("Ignoring empty sector in '%s'@0x%011x"%(path,ref_offset))
                     elif data in ref_big_hash:
                         (prev_ref_no, prev_ref_offset) = ref_big_hash[data]
-                        logging.info("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
-                            (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
+                        if warn_dup:
+                            logging.info("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
+                                (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
+                            warn_dup = False
+                        else:
+                            logging.debug("Non-unique sector found in ref. files ('%s'@0x%011x and '%s'@0x%011x)"%
+                                (prev_ref_no, prev_ref_offset, ref_no, ref_offset))
                     else:
+                        # Hash in memory the whole read sector and store its provenance
                         ref_big_hash[data] = (ref_no, ref_offset)

         start = 0
@@ -55,29 +67,107 @@ def do_find_files(d,state):
         state['found'] = []
         state['progress'] = 0
+        state['raw_matches'] = 0
         state['state'] = 'searching'
+        raw_matches = []
         for offset in range(start, end, 512):
             for disk_no in range(d.disk_count):
                 d.disks[disk_no].seek(offset)
                 data = d.disks[disk_no].read(512)
                 if data in ref_big_hash:
-                    f = state['found']
-                    if len(f) < 200:
-                        # TODO agreger les matches
-                        (ref_no, ref_offset) = ref_big_hash[data]
-                        f.append((ref_no,ref_offset,disk_no,offset))
-                        state['found'] = f
-                    else:
-                        state['state'] = 'aborted'
-                        raise Exception('Aborting after too many matches')
+                    (ref_no, ref_offset) = ref_big_hash[data]
+                    a = {
+                        'ref_no': ref_no,
+                        'ref_offset': ref_offset,
+                        'disk_no': disk_no,
+                        'disk_offset': offset
+                    }
+                    raw_matches.append(a)
+                    logging.info('raw_matches.append(%s)'%a)
+                    state['raw_matches'] = len(raw_matches)
             if offset % one_percent == 0:
                 state['progress'] = state['progress'] + 1

+        agg = state['found']
+
+        state['state'] = 'aggregating'
+        found = True
+        while found:
+            found = False
+            i = 0
+            rm_len = len(raw_matches)
+            while not found and i < rm_len:
+                x = raw_matches[i]
+
+                # Try to find an aggregated item that ends just before our ref_offset
+                if not found:
+                    for a in agg:
+                        if a['ref_no'] == x['ref_no'] and x['ref_offset'] == a['ref_offset_end']:
+                            a['ref_offset_end'] = x['ref_offset'] + 512
+                            a['block_list'].append((x['disk_no'], x['disk_offset']))
+                            x['consumed'] = True
+                            found = True
+                            break
+
+                # Try to find an aggregated item that starts just after our ref_offset
+                if not found:
+                    for a in agg:
+                        if a['ref_no'] == x['ref_no'] and x['ref_offset'] + 512 == a['ref_offset_start']:
+                            a['ref_offset_start'] = x['ref_offset']
+                            a['block_list'].insert(0,(x['disk_no'], x['disk_offset']))
+                            x['consumed'] = True
+                            found = True
+                            break
+
+                # Try to find another match right before or after the current one to create an aggregate
+                if not found:
+                    for x2 in raw_matches:
+                        if x2 != x and x2['ref_no'] == x['ref_no'] and abs(x2['ref_offset'] - x['ref_offset']) == 512:
+                            if x2['ref_offset'] > x['ref_offset']:
+                                first = x
+                                last = x2
+                            else:
+                                first = x2
+                                last = x
+
+                            a = {
+                                'ref_no': x['ref_no'],
+                                'ref_offset_start': first['ref_offset'],
+                                'ref_offset_end': last['ref_offset'] + 512,
+                                'block_list': [
+                                    (first['disk_no'], first['disk_offset']),
+                                    (last['disk_no'], last['disk_offset']),
+                                ]
+                            }
+                            logging.info('agg.append(%s)'%a)
+                            agg.append(a)
+                            x['consumed'] = True
+                            x2['consumed'] = True
+                            found = True
+                            break
+                # Orphans are not injected in agg
+                i += 1
+            # end while ( x = raw_matches[i] )
+            # Remove all consumed items from raw_matches before the next while iteration
+            if found:
+                raw_matches = [m for m in raw_matches if 'consumed' not in m]
+                state['raw_matches'] = len(raw_matches)
+
+        # end while not found
+
+        if len(agg) < 200:
+            state['found'] = agg
+            state['state'] = 'searching'
+        else:
+            state['state'] = 'aborted'
+            raise Exception('Aborting after too many matches')

-        ref_big_hash.clear()
         state['state'] = 'finished'
         state['progress'] = 100
     except Exception as e:
         logging.exception(e)
+    finally:
+        ref_big_hash.clear()
+
     logging.info("Exit. do_find_files()")
--
cgit v1.2.3
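
The aggregation pass added above merges raw 512-byte sector matches into contiguous runs per reference file. Below is a minimal standalone sketch of that idea, not the patch's code: it sorts the matches first instead of doing the pairwise merge above, it keeps single-sector orphans as runs of one, and the aggregate_matches() helper name is purely illustrative. The match dict keys are the same ones the patch appends to raw_matches.

    SECTOR = 512

    def aggregate_matches(raw_matches):
        # Merge raw sector matches into contiguous runs per reference file.
        # Each raw match carries 'ref_no', 'ref_offset', 'disk_no', 'disk_offset'
        # like the dicts appended to raw_matches in the patch above.
        agg = []
        for m in sorted(raw_matches, key=lambda m: (m['ref_no'], m['ref_offset'])):
            last = agg[-1] if agg else None
            if (last is not None and last['ref_no'] == m['ref_no']
                    and last['ref_offset_end'] == m['ref_offset']):
                # m continues the current run: extend it by one sector.
                last['ref_offset_end'] = m['ref_offset'] + SECTOR
                last['block_list'].append((m['disk_no'], m['disk_offset']))
            else:
                # Start a new run (unlike the patch, orphans are kept as runs of one).
                agg.append({
                    'ref_no': m['ref_no'],
                    'ref_offset_start': m['ref_offset'],
                    'ref_offset_end': m['ref_offset'] + SECTOR,
                    'block_list': [(m['disk_no'], m['disk_offset'])],
                })
        return agg

    if __name__ == '__main__':
        sample = [
            {'ref_no': 0, 'ref_offset': 1024, 'disk_no': 1, 'disk_offset': 4096},
            {'ref_no': 0, 'ref_offset': 512,  'disk_no': 1, 'disk_offset': 3584},
            {'ref_no': 0, 'ref_offset': 4096, 'disk_no': 0, 'disk_offset': 8192},
        ]
        for run in aggregate_matches(sample):
            print(run)

With the sample input, the first two matches collapse into one run covering reference offsets 512-1536 with an ordered block_list, and the isolated match at offset 4096 becomes its own run.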