# object-detection.py

import ctypes
import math
import os
import re
import struct
import sys
import time
import timeit
import io
import json
import numpy as np
import random
from collections import OrderedDict

from itertools import groupby

import volatility.debug as debug
import volatility.obj as obj
import volatility.plugins.linux.common as linux_common
from volatility.plugins.linux import pslist as linux_pslist
from volatility.renderers import TreeGrid
from volatility import utils

# Path (relative to the working directory) of the Python profile text file.
# NOTE(review): neither constant is referenced by the code visible in this
# file -- confirm they are still needed before relying on them.
PROFILE_PATH = "./Scripts/ScriptOutputs/profile_py.txt"  # PATH TO PYTHON PROFILE
# Module-level slot initialised to None; nothing visible here assigns it.
PROFILE_DATA = None

def extract_data(addr_space, num_elements, buf):
    """
    Read ``num_elements`` consecutive "float32" objects starting at ``buf``.

    addr_space   -- address space handed to obj.Object as ``vm``
    num_elements -- number of 4-byte elements to read; anything accepted by
                    int() (plain ints or Volatility numeric objects)
    buf          -- offset of the first element of the tensor's data buffer

    Returns the list of values read, or [] as soon as one element's ``val``
    is not a Python float (the tensor is then treated as invalid).  A count
    that is zero or negative yields [].

    The first three values are printed as a quick sanity check.
    """
    # Coerce once and loop with '<': the previous '!=' test never terminated
    # when the count was negative or not a whole number.
    count = int(num_elements)
    values = []
    offset = buf
    idx = 0
    while idx < count:
        found_object = obj.Object("float32",
                                  offset=offset,
                                  vm=addr_space)
        value = found_object.val
        if idx < 3:
            print(value)
        if not isinstance(value, float):  # invalid tensor
            return []
        values.append(value)
        offset += 4  # elements are laid out 4 bytes apart
        idx += 1

    return values


def find_tensors(task, addr_space, num_elements_dict, data_ptrs, amt_repeat):
    """
    Brute-force the heap of ``task`` for tensor objects with known shapes.

    Every VMA whose name is "[heap]" is walked from its 8-byte-aligned end
    address downwards in 8-byte steps.  At each address a "_Tensor1" object
    is instantiated; when it is valid and its element count is a key of
    ``num_elements_dict``, its dimensions are compared against each expected
    shape registered for that count.  The first matching name wins: its data
    pointer is recorded and the tensor's values are pulled with
    extract_data().

    task              -- process whose memory maps are scanned
    addr_space        -- address space used to instantiate objects
    num_elements_dict -- {element count: [(tensor name, shape list), ...]}
    data_ptrs         -- {tensor name: set()}; updated IN PLACE with the data
                         pointer of every matching tensor
    amt_repeat        -- unused; kept so existing callers keep working

    Returns {tensor name: [extracted value list, ...]}.  Every key of
    ``data_ptrs`` is present; names that were never matched map to [].
    An individual entry may itself be [] when extract_data() rejected the
    candidate.

    Note: the walk starts at the aligned ``vm_end`` and stops before the
    aligned ``vm_start``, so the start address itself is not probed.
    """
    heaps = [vma for vma in task.get_proc_maps()
             if vma.vm_name(task) == "[heap]"]

    weight_candidates = {}

    for heap in heaps:
        # Floor division keeps the alignment arithmetic integral regardless
        # of the interpreter's '/' semantics.
        tmp = heap.vm_end // 8 * 8  # round down to a multiple of 8
        end = (heap.vm_start + 7) // 8 * 8  # round up to a multiple of 8
        print("from %s to %s" % (hex(int(tmp)), hex(int(end))))

        # '>' rather than '!=' so the walk cannot run away if a VMA's
        # bounds are inverted.
        while tmp > end:
            found_object = obj.Object("_Tensor1",
                                      offset=tmp,
                                      vm=addr_space)

            if (found_object.is_valid()
                    and int(found_object.num_elements) in num_elements_dict):

                for tup in num_elements_dict[int(found_object.num_elements)]:
                    name, arr = tup[0], tup[1]

                    # Compare dimension by dimension, stopping at the first
                    # mismatch.
                    shape_valid = all(
                        dim == int(found_object.shape[i])
                        for i, dim in enumerate(arr))
                    if not shape_valid:
                        continue

                    # Dereference the buffer once instead of once per field.
                    tensor_buf = found_object.buf_.dereference()
                    data_ptr = tensor_buf.data_

                    data_ptrs[name].add(data_ptr)
                    print("")
                    print("%s works" % (name,))
                    print("num_elements %s" % (found_object.num_elements,))
                    print("obj_offset %s" % (hex(found_object.obj_offset),))
                    print("vtable ptr (0x7fffd0d16c48L): %s"
                          % (hex(tensor_buf.vtable_ptr),))
                    print("data_ ptr: %s" % (hex(data_ptr),))
                    print("%s bytes left" % (hex(tmp - end),))

                    weight_candidates.setdefault(name, []).append(
                        extract_data(addr_space,
                                     found_object.num_elements,
                                     int(data_ptr)))

                    # Credit only the first name whose shape matches.
                    break

            tmp -= 8  # from end to beginning

    print("\ndone with extraction\n")

    # Guarantee an entry for every expected tensor, matched or not.
    for key in data_ptrs:
        weight_candidates.setdefault(key, [])

    return weight_candidates


def get_avg(weights, inds):
    """
    Return the mean absolute value of ``weights`` over the positions ``inds``.

    weights -- indexable sequence of numbers
    inds    -- sized collection of indices into ``weights``

    Returns a float.  An empty ``inds`` yields 0.0 instead of raising
    ZeroDivisionError, so candidates with no data simply rank lowest.
    """
    if len(inds) == 0:
        return 0.0
    total = 0.0
    for idx in inds:
        total += abs(weights[idx])
    return total / float(len(inds))


def sample(arr):
    """
    Score each candidate by the mean absolute value of a sample of its
    weights, then sort the candidates by descending score.

    arr -- list of mutable pairs ``[score, weights]``; each ``score`` slot is
           overwritten and the list is sorted IN PLACE.

    Sampling policy per candidate:
      * no weights        -> score 0.0 (nothing to average)
      * up to 30 weights  -> all of them
      * up to 300 weights -> 30 random positions
      * otherwise         -> a random 10% of the positions

    Returns ``arr`` itself.  Used to filter out optimizers, on the premise
    that the largest averages belong to real weights.
    """
    for pair in arr:
        weights = pair[1]
        n = len(weights)
        if n == 0:
            # A rejected/empty candidate has nothing to average; previously
            # this divided by zero.
            pair[0] = 0.0
            continue
        if n <= 30:
            inds = range(n)
        elif n <= 300:
            inds = random.sample(xrange(n), 30)
        else:
            # '//' keeps the sample size an integer under any division mode.
            inds = random.sample(xrange(n), n // 10)
            inds.sort()
        pair[0] = get_avg(weights, inds)
    arr.sort(reverse=True)
    return arr


def _top_candidates(candidate_lists, copies):
    """Score candidates with sample() and keep the top 1/``copies`` of them."""
    pool = sample([[0.0, values] for values in candidate_lists])
    keep = len(pool) // copies
    return [pair[1] for pair in pool[:keep]]


def process_file(task, file_name):
    """
    Load json with correct weights and shapes,
    aggregate them into data structures,
    then recover the matching weights from the heap of ``task``.

    task      -- process to inspect; must provide
                 get_process_address_space()
    file_name -- JSON file with a 'model_name' entry and a 'tensors' mapping
                 of tensor name -> {'shape': [dims...]}

    Tensors whose name contains "gamma" or "beta" are skipped.  Results are
    written as JSON to "<model_name>-weights.txt" in the working directory
    and progress is printed along the way.  Returns None.
    """
    # Hardcoded (depends on optimizers present): how many heap copies exist
    # per tensor.  Only the top 1/copies candidates are kept as weights.
    copies_per_tensor = 1

    addr_space = task.get_process_address_space()

    # Close the input promptly and keep the tensors in file order so the
    # summary below follows the model definition.
    with open(file_name, "r") as f:
        dump = json.load(f, object_pairs_hook=OrderedDict)

    tensors = dump['tensors']
    ret = {}
    data_ptrs = {}
    shape = OrderedDict()

    for tensor in tensors:
        if "gamma" in tensor or "beta" in tensor:  # ignore bn layers
            continue
        dims = tensors[tensor]['shape']
        print("Name: %s" % (tensor,))
        print("Shape: %s" % (dims,))
        shape[tensor] = dims

        tot = 1
        for x in dims:
            tot *= x

        # Group tensors by total element count for the heap search.
        ret.setdefault(tot, []).append((tensor, dims))
        data_ptrs[tensor] = set()

    dups = {}
    tot_num_elements = 0
    for num in ret:
        group = ret[num]
        tot_num_elements += num * len(group)
        # Sorting by shape puts identical shapes next to each other.
        group.sort(key=lambda x: x[1])
        mem = group[0]
        for other in group[1:]:  # detect duplicate shapes
            if mem[1] == other[1]:
                dups.setdefault(mem[0], []).append(other[0])
            else:
                mem = other

    print("Total elements: %s" % (tot_num_elements,))
    print(ret)  # dictionary {num_elements: (model_name, shape)
    print(shape)  # OrderedDict {model_name: shape}
    print(dups)  # dictionary {model_name: names with identical shapes}

    weights = find_tensors(task, addr_space, ret, data_ptrs,
                           copies_per_tensor)

    final = {}

    # must collect all identical tensor shapes in one pool to filter out
    # optimizers
    for key in dups:
        candidates = list(weights[key])
        for name in dups[key]:
            candidates.extend(weights[name])
        # the greatest averages must be weights
        must_be_weights = _top_candidates(candidates, copies_per_tensor)

        final[key] = must_be_weights
        for name in dups[key]:
            final[name] = must_be_weights

    # handle distinct tensors now
    for key in weights:
        if key in final:
            continue
        final[key] = _top_candidates(weights[key], copies_per_tensor)

    print("MODEL SUMMARY")
    out_dict = {'model_name': dump['model_name'],
                'num_elements': tot_num_elements,
                'tensors': {}}
    for key in shape:
        print(key)
        print(shape[key])
        if key in final:
            out_dict['tensors'][key] = {'shape': shape[key],
                                        'weights': final[key]}
            print("Weights added to file")
        print("")

    with open(dump['model_name'] + "-weights.txt", "w") as f:
        json.dump(out_dict, f)

    if not dups:
        print("No Duplicate Tensors")
    else:
        print("Duplicate Tensors Found (weights match any of them):")
        for key in dups:
            # Build a fresh list for display instead of mutating dups.
            print(dups[key] + [key])


def _is_python_task(task, pidstr):
    """
    Checks if the task has the specified Python PID
    """
    if str(task.pid) != pidstr:
        return False
    else:
        return True


class obj_detect_weights(linux_pslist.linux_pslist):
    """
    Recovers Tensorflow model attributes from a Python process using ground truth shapes.
    Differs from cifar-10.py because it does not need to traverse GC to extract shapes.
    """

    # Ground-truth JSON (model name and tensor shapes) passed to
    # process_file(); a relative path, so it is resolved against the
    # working directory.  Override in a subclass to target another model.
    GROUND_TRUTH_FILE = "obj_detect_mobilenetv1_0.txt"

    def __init__(self, config, *args, **kwargs):
        """Initialise the base plugin and register the -p/--PID option."""
        linux_pslist.linux_pslist.__init__(self, config, *args, **kwargs)
        self._config.add_option(
            'PID', short_option = 'p', default = None,
                          help = 'Operate on the Python Process ID',
                          action = 'store', type = 'str')

    def _validate_config(self):
        """
        Require exactly one PID on the command line.

        A missing or empty PID is reported as well: without it no task can
        match and the plugin would finish without doing any work.
        """
        pid = self._config.PID
        if not pid or len(pid.split(',')) != 1:
            debug.error("Please enter the process PID")

    def calculate(self):
        """
        Locate the task with the requested PID and run process_file() on it.

        The elapsed time is printed and the interpreter is then terminated
        with sys.exit(0); nothing is returned to the renderers.

        Runtime stats:
        Finding Sequential takes 5 minutes
        Brute force through heap (for tensor objects) takes: 2.1 min / 10 MB
        Total about: 15 minutes (depends on how tensors are spread out)
        """
        start = timeit.default_timer()
        linux_common.set_plugin_members(self)

        self._validate_config()
        pidstr = self._config.PID

        tasks = [task for task in linux_pslist.linux_pslist.calculate(self)
                 if _is_python_task(task, pidstr)]

        for task in tasks:
            process_file(task, self.GROUND_TRUTH_FILE)

        stop = timeit.default_timer()
        print("\nRuntime: {0} seconds".format(stop - start))
        # All results were already printed / written by process_file().
        sys.exit(0)

    def unified_output(self, data):
        """
        Return a TreeGrid with data to print out.
        """
        return TreeGrid([("Name", str)],
                        self.generator(data))

    def generator(self, data):
        """
        Generate data that may be formatted for printing.

        Yields one (depth, [text]) row per item, taking the text from the
        item's ``string`` attribute.
        """
        for instance in data:
            yield (0, [str(instance.string)])

    def render_text(self, outfd, data):
        """Write the rows produced by generator() as a one-column table."""
        self.table_header(outfd, [("Dict", "70")])
        for _, output in self.generator(data):
            self.table_row(outfd, *[str(o) for o in output])