analyzer.py

import sys
import os
import re
import collections
import json
import sqlite3
import shutil
import argparse
import subprocess
import fnmatch
import uuid
from multiprocessing import Process

# Command line interface. ArgumentDefaultsHelpFormatter makes --help show the
# default value of every option.
arg_parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Positional arguments.
arg_parser.add_argument("tool_path", help="path to Unity tools")
arg_parser.add_argument("path", help="path of the asset bundle folder")
# Optional arguments.
arg_parser.add_argument("-p", "--pattern", default="*", help="asset bundle search pattern")
arg_parser.add_argument("-o", "--output", default="database.db", help="name of the output database file")
arg_parser.add_argument("-k", "--keep-temp", help="keep extracted files in asset bundle folder", action='store_true')
arg_parser.add_argument("-r", "--store-raw", help="store raw json object in 'raw_objects' database table", action='store_true')
arg_parser.add_argument("-d", "--debug", help="enable pdb debugger to break when ctrl-c is pressed", action='store_true')
arg_parser.add_argument("-v", "--verbose", help="display verbose script logging", action='store_true')
# NOTE: the command line is parsed at module load time; `args` is a module
# level global that the rest of this file reads directly.
args = arg_parser.parse_args()

def main():
    """
    Extract every matching asset bundle and load its objects into the database.

    The output database file (args.output) is deleted if it already exists and
    created from scratch. Every file below args.path whose name matches
    args.pattern is run through WebExtract; every file extracted into
    "<bundle>_data" is then run through binary2text, parsed and stored.
    Extracted folders are removed afterwards unless --keep-temp is given.

    The database connection is always closed, even when processing fails.
    """
    if sys.version_info[0] < 3:
        print("\n***** Using Python 3 is highly recommended! *****\n")

    if args.debug:
        import signal
        signal.signal(signal.SIGINT, debug_signal_handler)

    # Always start from an empty database.
    if os.path.exists(args.output):
        os.remove(args.output)
    db = sqlite3.connect(args.output)

    try:
        file_index = FileIndex(db)
        processor = ObjectProcessor(file_index)
        processor.init_database(db)

        if not os.path.isdir(args.path):
            print ("Path is not a directory!")
            return

        bundle_id = 0
        cursor = db.cursor()

        for root, dirs, files in os.walk(args.path):
            for bundle_file in files:
                if not fnmatch.fnmatch(bundle_file, args.pattern):
                    continue

                filepath = os.path.join(root, bundle_file)
                # NOTE(review): ret_code is an int passed by value, so the
                # calls below cannot change it and the "ret_code == 0" tests
                # are always true here. Presumably the helpers were meant to
                # return the exit code - confirm against their definitions.
                ret_code = 0

                # WebExtract the asset bundle.
                run_tool_with_timeout("WebExtract", filepath, ret_code, 60, 0)

                # Extracted files end up in a sibling "<bundle>_data" folder.
                datapath = filepath + "_data"
                if not (ret_code == 0 and os.path.isdir(datapath)):
                    continue

                bundle_id += 1
                bundle_name = os.path.relpath(filepath, os.path.realpath(args.path))
                bundle_size = os.path.getsize(filepath)

                cursor.execute('''
                    INSERT INTO asset_bundles(id, name, file_size)
                        VALUES(?,?,?)
                ''', (bundle_id, bundle_name, bundle_size))
                db.commit()

                debug_print("Processing " + bundle_name, 1)

                # Iterate extracted files.
                for extracted in os.listdir(datapath):
                    datafile = os.path.join(datapath, extracted)

                    run_tool("binary2text", datafile, ret_code, 2)

                    # Parse and process the text dump, if one was produced.
                    if ret_code == 0 and os.path.isfile(datafile + ".txt"):
                        debug_print("Parsing " + extracted, 3)
                        parser = Parser(file_index)
                        objs = parser.parse(datafile + ".txt")
                        processor.process_objects(bundle_id, objs, db, extracted, args.store_raw)

                if not args.keep_temp:
                    shutil.rmtree(datapath)
    finally:
        # Release the database even if a tool or the parser raised.
        db.close()


# Named tuples to store object fields.
#
# ParsedField is one line of a binary2text dump as produced by
# Parser._parse_line: the indentation level (number of leading tabs), the
# field name, the raw value string (None when the line carries no value) and
# the type name.
ParsedField = collections.namedtuple("ParsedField", "level name value type")
# Field is a node of the parsed object tree: a type name and the converted
# value (a scalar, a list or a nested dict).
Field = collections.namedtuple("Field", "type value")


class Parser(object):
    """
    Parser for the text files generated by binary2text.

    A file is first split into objects; the lines of every object are turned
    into a flat list of ParsedField tuples (_parse_lines) which is then folded
    into a nested dictionary of Field tuples (_parse_obj).
    """

    # A field line: "<name> <value> (<type>)". The value is a parenthesised
    # group, a quoted string, a bare token, or a single whitespace character.
    _FIELD_REGEX = re.compile(r"(\S+) (\(.*?\)|\".*?\"|\S+|\s)\s?\((\S+\s?\S*)\)")
    # An inline array data line: "data (<type>) #<n>: <values>".
    _VECTOR_DATA_REGEX = re.compile(r"data \s?\((\S+\s?\S*)\) #\d+: (.*)")
    # An external reference line: path(<local index>): ".../<file name>".
    _REFERENCE_REGEX = re.compile(r"path\((\d+)\)\: \".*/(.+?)\"")
    # An object: its header line followed by everything up to the next header.
    _OBJECT_REGEX = re.compile(r"ID: (\-?[a-f0-9]+) \(ClassID: (\d+)\) (\w+)([\s\S]*?(?=(\n{2,}ID:|$)))")

    # Type names converted with int().
    _INT_TYPES = frozenset([
        "int", "unsigned int", "SInt64", "UInt64", "SInt32", "UInt32",
        "SInt16", "UInt16", "SInt8", "UInt8",
    ])
    # Types written as "(a b c ...)" and converted to a list of float Fields.
    _FLOAT_TUPLE_REGEXES = {
        "Vector4f": re.compile(r"\((\S+) (\S+) (\S+) (\S+)\)"),
        "Vector3f": re.compile(r"\((\S+) (\S+) (\S+)\)"),
        "Vector2f": re.compile(r"\((\S+) (\S+)\)"),
        "ColorRGBA": re.compile(r"\((\S+) (\S+) (\S+) (\S+)\)"),
    }

    def __init__(self, file_index):
        # Current parsed field index.
        self._index = 0
        # List of parsed fields.
        self._fields = []
        # Field regex.
        self._field_regex = self._FIELD_REGEX
        # File index object.
        self._file_index = file_index
        # Dict mapping local file index to global unique id.
        self._external_references = {}

    def parse(self, filepath):
        """
        This function parses a file generated by binary2text.
        It returns a dictionary having this structure:
        {
            Object_ID:
            {
                "ClassID": Unity class id corresponding to the type of this object.
                "Type": Type name
                "Content": Content of this object as a nested dictionary.
            }
        }

        Any error raised while parsing an object is re-raised after the
        object id and the fields around the failing position were printed.
        """

        self._external_references = {}
        # File id 0 is always the current file (we store only the base file name without .txt extension).
        self._external_references[0] = self._file_index.get_id(os.path.basename(filepath[:-4]))

        # Open the file once: read the reference header, then rewind and load
        # the whole text for object extraction.
        with open(filepath, 'r', encoding='UTF-8') as f:
            self._read_external_references(f)
            f.seek(0)
            data = f.read()

        # Parse the whole file, extract all objects.
        matches = self._OBJECT_REGEX.findall(data)

        objects = {}

        # Parse individual objects.
        for match in matches:
            try:
                self._parse_lines(match[3])
                objects[int(match[0])] = {"ClassID": int(match[1]), "Type": match[2], "Content": self._parse_obj()}
            except Exception:
                print("Error in " + match[0])
                self._print_error()
                raise

        return objects

    def _read_external_references(self, f):
        """
        Read the optional "External References" section at the top of f.

        Every line of the section maps a local file index to a file name; the
        mapping to the global file id is stored in _external_references. The
        section ends at the first empty line or at the end of the file.
        Raises Exception("Error in references") on a malformed line.
        """
        if f.readline() != "External References\n":
            return

        while True:
            line = f.readline()
            if not line or line == '\n':
                break

            m = self._REFERENCE_REGEX.match(line)
            if not m:
                raise Exception("Error in references")

            # Get the local file id <> global file id pair.
            local_index = int(m.group(1))
            self._external_references[local_index] = self._file_index.get_id(m.group(2))

    def _parse_lines(self, data):
        """Turn the text of one object into the flat _fields list."""
        self._index = 0
        # Every line is a potential field; lines that are not fields yield None.
        parsed = (self._parse_line(line) for line in data.splitlines())
        self._fields = [field for field in parsed if field]

    def _print_error(self):
        """Print the fields surrounding the current index, marking it with '*'."""
        for i in range(max(0, self._index - 20), min(self._index + 20, len(self._fields))):
            print("{0} {1}".format("*" if self._index == i else " ", self._fields[i]))

    def _make_pptr(self, pptr, pptr_type):
        """
        Build the Field describing a PPtr.

        pptr is the parsed content of the pointer (m_FileID and m_PathID
        fields), pptr_type its type name ("PPtr<...>"). The local file id is
        translated into the global file index.
        """
        global_index = self._external_references[pptr["m_FileID"].value]
        return Field("PPtr", {
            "GlobalFileIndex": global_index,
            "File": self._file_index.files[global_index],
            "ID": pptr["m_PathID"].value,
            "Type": pptr_type[5:-1], # Remove PPtr<>
        })

    def _parse_vector(self, field, level):
        """
        Parse the content of the vector field that was just consumed.

        field is the vector's own ParsedField (only used for diagnostics) and
        level the indentation level it was found at. Returns the Field to
        store for the vector.
        """
        # Special case for vectors, size is next field.
        size_field = self._fields[self._index]
        self._index += 1

        # NOTE(review): with "and" this only rejects a field that has both the
        # wrong name and the wrong type; "or" may have been intended.
        if size_field.name != "size" and size_field.type != "int":
            raise Exception("Invalid array!")

        size = int(size_field.value)
        if size <= 0:
            # Store empty array
            return Field("vector", [])

        item_field = self._fields[self._index]

        if item_field.name == "<vector data>":
            # Case for POD vector arrays: the values are spread over one or
            # more consecutive <vector data> lines.
            values = item_field.value.split(" ")
            while True:
                self._index += 1
                # Break if we reached the end of <vector data> fields.
                if self._index == len(self._fields) or self._fields[self._index].name != "<vector data>":
                    break
                values.extend(self._fields[self._index].value.split(" "))

            return Field("pod_vector:" + item_field.type, [self._typecast(x, item_field.type) for x in values])

        # Make sure that indentation level is one higher.
        if item_field.level != level+1:
            print (field)
            print (item_field)
            raise Exception("Error parsing array!")

        vector = []

        if not item_field.value:
            # No value means a nested object (Case for "data (Type)" element).
            for _ in range(size):
                item_field = self._fields[self._index]
                self._index += 1

                if item_field.type[:5] == "PPtr<":
                    # Special case for PPtrs.
                    pptr = self._parse_obj(level+2)
                    vector.append({item_field.name: self._make_pptr(pptr, item_field.type)})
                else:
                    # Otherwise, recursively parse the nested object.
                    vector.append({item_field.name: Field(item_field.type, self._parse_obj(level+2))})
        else:
            # Case for simple types.
            for _ in range(size):
                element = self._fields[self._index]
                self._index += 1
                vector.append(Field(element.type, self._typecast(element.value, element.type)))

        return Field("vector", vector)

    def _parse_obj(self, level=1):
        """
        Fold the fields starting at the current index into a dictionary.

        Parsing stops at the first field whose indentation level is lower than
        level (that field is left for the caller) or when the fields are
        exhausted. Raises Exception("Indentation error!") on a field indented
        deeper than level.
        """
        obj = {}

        # Iterate over the parsed fields.
        while self._index < len(self._fields):
            field = self._fields[self._index]

            # If the field's indentation level is lower than the current level,
            # it means we finished parsing the current object.
            if field.level < level:
                # So return it.
                return obj
            # If it's higher, then there's an error.
            elif field.level > level:
                raise Exception("Indentation error!")

            # Increment the current field index.
            self._index += 1

            if field.value:
                # Special case for Vertex Data not having the same syntax.
                if field.name != "<vector data>":
                    # Simple type, just cast it.
                    obj[field.name] = Field(field.type, self._typecast(field.value, field.type))
            # If there's no field value, it means we're starting to parse a nested object.
            elif field.type == "vector":
                obj[field.name] = self._parse_vector(field, level)
            elif field.type[:5] == "PPtr<":
                # Special case for PPtrs.
                pptr = self._parse_obj(level+1)
                obj[field.name] = self._make_pptr(pptr, field.type)
            else:
                field_name = field.name

                # Special case because for some reason, lists are not always serialized as vectors.
                # Repeated names get a "/<n>" suffix to stay unique.
                if field_name in obj:
                    i = 0
                    while "{0}/{1}".format(field_name, i) in obj:
                        i += 1
                    field_name = "{0}/{1}".format(field_name, i)

                # Recursively parse nested object.
                obj[field_name] = Field(field.type, self._parse_obj(level+1))

        return obj

    def to_float(self, value):
        """Convert value to float, returning NaN when it is not a valid number."""
        try:
            return float(value)
        except ValueError:
            return float("nan")

    def _to_float_fields(self, value, typename):
        """
        Convert a "(a b ...)" value into a list of float Fields.

        Raises ValueError when value does not have the layout of typename.
        """
        match = self._FLOAT_TUPLE_REGEXES[typename].match(value)
        if not match:
            raise ValueError("Invalid {0} value: {1!r}".format(typename, value))
        return [Field("float", self.to_float(component)) for component in match.groups()]

    def _typecast(self, value, typename):
        """
        Convert the string value to the Python type matching typename.

        Unknown type names are returned unchanged.
        """
        if typename == "string":
            try:
                # That kind of unescape the string, but it doesn't work all the time.
                return json.loads(value)
            except (ValueError, TypeError):
                # Not valid JSON (or not a string at all): keep the raw value.
                return value
        if typename in self._INT_TYPES:
            return int(value)
        if typename == "float" or typename == "double":
            return self.to_float(value)
        if typename in self._FLOAT_TUPLE_REGEXES:
            return self._to_float_fields(value, typename)
        if typename == "bool":
            return int(value) == 1
        if typename == "GUID":
            return uuid.UUID(value)
        return value

    def _parse_line(self, line):
        """
        Parse a line and generate a ParsedField tuple.

        Returns None when the line is empty, a single character, or neither a
        field nor an inline array data line.
        """
        if not line or len(line) == 1:
            return None

        # Determine indentation level (count number of tabs).
        level = 0
        while level < len(line) and line[level] == '\t':
            level += 1
        content = line[level:]

        match = self._field_regex.match(content)
        if match:
            # A single space in the value position means there is no value.
            value = match.group(2) if match.group(2) != " " else None
            return ParsedField(level, match.group(1), value, match.group(3))

        # Try to parse array data.
        match = self._VECTOR_DATA_REGEX.match(content)
        if match:
            return ParsedField(level, "<vector data>", match.group(2), match.group(1))

        return None


class ObjectProcessor(object):
    """
    Stores parsed objects in the database.

    Creates the database schema (init_database) and, for every parsed file,
    inserts its objects, their types and their references (process_objects).
    Type specific information is delegated to the registered type handlers.
    """

    def __init__(self, file_index):

        # Dict mapping class id to type name.
        self._types = {}
        self._id_generator = IdGenerator()

        # File index mapping an extracted file name to a unique id.
        self._file_index = file_index

        # Type handler functions to extract type specific information.
        self._type_handlers = {
            "Mesh": MeshHandler(self._id_generator, self._file_index),
            "Texture2D": Texture2DHandler(self._id_generator, self._file_index),
            "Shader": ShaderHandler(self._id_generator, self._file_index),
            "AnimationClip": AnimationClipHandler(self._id_generator, self._file_index),
            "AudioClip": AudioClipHandler(self._id_generator, self._file_index),
            "AssetBundle": AssetBundleHandler(self._id_generator, self._file_index)
        }
        # Handler used for every type that has no dedicated handler.
        self._default_handler = DefaultHandler(self._id_generator, self._file_index)

    @staticmethod
    def _json_default(value):
        """
        json.dumps fallback for values it cannot serialize natively.

        GUID fields are parsed into uuid.UUID objects; they are stored as
        their string form. Anything else is still reported as an error.
        """
        if isinstance(value, uuid.UUID):
            return str(value)
        raise TypeError("Object of type {0} is not JSON serializable".format(type(value).__name__))

    def process_objects(self, bundle_id, objs, db, file, store_raw):
        """
        Store all the objects of one extracted file in the database.

        bundle_id: id of the asset bundle the file was extracted from.
        objs: dictionary returned by Parser.parse.
        db: open database connection; committed once at the end.
        file: name of the extracted file.
        store_raw: when true, also store each object as JSON in raw_objects.

        Objects whose unique id is already in the database are skipped; after
        more than 10 of them the rest of the file is abandoned.
        Raises Exception when a class id shows up with two different type names.
        """
        cursor = db.cursor()
        file_id = self._file_index.get_id(file)

        count = 0
        error_count = 0

        # Iterate on objects.
        for object_id, obj in objs.items():
            count += 1

            # Get unique id for this object.
            current_id = self._id_generator.get_id(file_id, object_id)

            cursor.execute("SELECT 1 FROM objects WHERE id = ?", (current_id,))
            if cursor.fetchone() is not None:
                error_count += 1
                print("Ignoring duplicate object {0}!".format(object_id))
                if error_count > 10:
                    print("Too many duplicate objects, aborting file {0}!".format(file))
                    print("Are you trying to analyze multiple variants of the same bundles? You should analyze one set of bundles at a time.")
                    break
                continue

            class_id = obj["ClassID"]
            typename = obj["Type"]

            # If there's a m_GameObject PPtr field, get the id of the game object.
            game_object_pptr = obj["Content"].get("m_GameObject")
            game_object_id = None
            if game_object_pptr:
                game_object_id = self._id_generator.get_id(
                    game_object_pptr.value["GlobalFileIndex"], game_object_pptr.value["ID"])

            # Call type specific handler.
            handler = self._type_handlers.get(typename, self._default_handler)
            name, size, references, serialized_fields = handler.process(current_id, obj["Content"], cursor, bundle_id)

            # Add type if new.
            if class_id not in self._types:
                self._add_type(cursor, class_id, typename)
            elif typename != self._types[class_id]:
                raise Exception("ClassID and Type mismatch!")

            # Add object to database.
            cursor.execute('''
                INSERT INTO objects(id, file, object_id, bundle_id, class_id, name, game_object, size, serialized_fields)
                    VALUES(?,?,?,?,?,?,?,?,?)
            ''', (current_id, file_id, object_id, bundle_id, class_id, name, game_object_id, size, serialized_fields))

            # Also store the JSON object if requested.
            if store_raw:
                # The default hook is required because GUID values are
                # uuid.UUID instances, which json cannot serialize by itself.
                json_obj = json.dumps(obj, indent=2, default=self._json_default)
                cursor.execute('''
                    INSERT INTO raw_objects(id, data, data_size)
                        VALUES(?,?,?)
                ''', (current_id, json_obj, len(json_obj)))

            # Store references.
            cursor.executemany('''
                INSERT INTO refs(referrer_id, referee_id, field, type, built_in)
                    VALUES(?,?,?,?,?)
            ''', [(current_id, ref[0], ref[1], ref[2], ref[3]) for ref in references])
        db.commit()

        if args.verbose:
            debug_print("{0} objects".format( count ), 4)

    def _add_type(self, cursor, class_id, typename):
        """Insert a new type in the database and remember it in _types."""
        cursor.execute('''
            INSERT INTO types(class_id, name)
                VALUES(?,?)
        ''', (class_id, typename))
        self._types[class_id] = typename

    @staticmethod
    def _pretty_size_sql(expr):
        """
        Return the SQL CASE expression formatting the byte count expr as a
        human readable size (B, KB, MB or GB).
        """
        return '''CASE
                WHEN {0} < 1024 THEN printf('%!5.1f B', {0} * 1.0)
                WHEN {0} >= 1024 AND {0} < (1024 * 1024) THEN printf('%!5.1f KB', {0} / 1024.0)
                WHEN {0} >= (1024 * 1024) AND {0} < (1024 * 1024 * 1024) THEN printf('%!5.1f MB', {0} / 1024.0 / 1024)
                WHEN {0} >= (1024 * 1024 * 1024) THEN printf('%!5.1f GB', {0} / 1024.0 / 1024 / 1024)
            END'''.format(expr)

    def init_database(self, db):
        """
        Create the tables, indexes and views, then let every type handler
        create its own structures. The connection is committed at the end.

        String literals in the SQL use single quotes: double-quoted strings
        are only accepted by SQLite as a legacy quirk that can be disabled.
        """
        cursor = db.cursor()

        ##### Types #####
        cursor.execute('''
            CREATE TABLE types(
                class_id INTEGER,
                name TEXT,
                PRIMARY KEY (class_id)
            )
        ''')

        ##### Asset Bundles #####
        cursor.execute('''
            CREATE TABLE asset_bundles(
                id INTEGER,
                name TEXT,
                file_size INTEGER,
                PRIMARY KEY (id)
            )
        ''')

        ##### Objects #####
        # Note: The natural primary key for objects is a composite of file + object_id, but
        # this complexifies all queries so a unique integer id is used instead.
        cursor.execute('''
            CREATE TABLE objects(
                id INTEGER,
                file INTEGER,
                object_id INTEGER,
                bundle_id INTEGER,
                class_id INTEGER,
                name TEXT,
                game_object INTEGER,
                size INTEGER,
                serialized_fields INTEGER,
                PRIMARY KEY (id)
                FOREIGN KEY (class_id) REFERENCES types(class_id)
                FOREIGN KEY (bundle_id) REFERENCES asset_bundles(id)
            )
        ''')
        # The "files" table joined below is not created in this method.
        cursor.execute('''
            CREATE VIEW object_view AS
            SELECT objects.id, objects.object_id, asset_bundles.name AS bundle, files.name AS file, objects.class_id, types.name AS type, objects.name, objects.game_object, objects.size,
            {pretty_size} AS pretty_size,
            objects.serialized_fields
            FROM objects
            INNER JOIN types ON objects.class_id = types.class_id
            INNER JOIN files ON objects.file = files.id
            INNER JOIN asset_bundles ON objects.bundle_id = asset_bundles.id
        '''.format(pretty_size=self._pretty_size_sql("size")))
        cursor.execute('''
            CREATE TABLE raw_objects(
                id INTEGER,
                data TEXT,
                data_size INTEGER,
                PRIMARY KEY (id)
                FOREIGN KEY (id) REFERENCES objects(id)
            )
        ''')

        ##### References #####
        cursor.execute('''
            CREATE TABLE refs(
                referrer_id INTEGER,
                referee_id INTEGER,
                field TEXT,
                type TEXT,
                built_in INTEGER,
                PRIMARY KEY (referrer_id, referee_id, field)
                FOREIGN KEY (referrer_id) REFERENCES objects(id)
                FOREIGN KEY (referee_id) REFERENCES objects(id)
            )
        ''')
        cursor.execute('''
            CREATE INDEX idx_refs_referrers ON refs (referrer_id)
        ''')
        cursor.execute('''
            CREATE INDEX idx_refs_referees ON refs (referee_id)
        ''')

        ##### Views #####
        cursor.execute('''
            CREATE VIEW view_breakdown_by_type AS
            SELECT *,
            {pretty_size} AS pretty_size
            FROM
            (SELECT type, count(*) AS count, sum(size) AS byte_size
            FROM object_view AS o
            GROUP BY type
            ORDER BY byte_size DESC, count DESC)
        '''.format(pretty_size=self._pretty_size_sql("byte_size")))
        # TODO: add other exception (components)
        cursor.execute('''
            CREATE VIEW view_potential_duplicates AS
            SELECT COUNT(DISTINCT bundle) AS instances,
            {pretty_size} AS pretty_size,
            sum(size) AS size, name, type, REPLACE(GROUP_CONCAT(DISTINCT bundle), ',', CHAR(13)) AS in_bundles
            FROM object_view
            WHERE size > 0 AND NOT (type = 'Texture2D' AND name LIKE ('Lightmap%_comp_light'))
            GROUP BY name, type, size
            HAVING instances > 1
            ORDER BY size DESC, instances DESC
        '''.format(pretty_size=self._pretty_size_sql("sum(size)")))
        cursor.execute('''
            CREATE VIEW view_references_to_default_material AS
            SELECT go.bundle AS bunlde, go.name AS game_object FROM objects m
            INNER JOIN refs r ON r.referee_id = m.id
            INNER JOIN objects o ON o.id = r.referrer_id
            INNER JOIN object_view go ON o.game_object = go.id
            WHERE m.name = 'Default-Material'
        ''')
        cursor.execute('''
            CREATE VIEW asset_bundle_view AS
            SELECT
                ab.id,
                ab.name,
                ab.file_size,
                {pretty_file_size} AS pretty_file_size,
                sum(o.size) as uncompressed_size,
                {pretty_uncompressed_size} AS pretty_uncompressed_size,
                sum(o.size)*1.0 / ab.file_size as compression_ratio
            FROM asset_bundles ab
            INNER JOIN objects o ON o.bundle_id = ab.id
            GROUP BY ab.name
        '''.format(
            pretty_file_size=self._pretty_size_sql("ab.file_size"),
            pretty_uncompressed_size=self._pretty_size_sql("sum(o.size)"),
        ))
        cursor.execute('''
            CREATE VIEW material_view AS
            SELECT m.bundle, m.name material_name, s.name shader_name
            FROM object_view m
            INNER JOIN refs r ON r.referrer_id = m.id
            INNER JOIN object_view s ON s.id = r.referee_id
            WHERE m.type = 'Material' AND s.type = 'Shader'
        ''')

        # Let the type handlers create their own tables.
        for h in self._type_handlers.values():
            h.init_database(cursor)

        db.commit()


# Base type handler class.
class BaseHandler(object):
    """
    Base class of the type handlers.

    A handler extracts the name, size, references and field count of an
    object and may store type specific information in its own tables.
    """

    # Serialized size in bytes of the fixed size types.
    _SIZE_MAP = {
        "int": 4,
        "unsigned int": 4,
        "SInt64": 8,
        "UInt64": 8,
        "SInt32": 4,
        "UInt32": 4,
        "SInt16": 2,
        "UInt16": 2,
        "SInt8": 1,
        "UInt8": 1,
        "char": 1,
        "float": 4,
        "double": 8,
        "Vector4f": 16,
        "Vector3f": 12,
        "Vector2f": 8,
        "bool": 1, # TODO: check real serialized size of bool
        "GUID": 16,
    }

    def __init__(self, id_generator, file_index):
        self._id_generator = id_generator
        self._file_index = file_index

    def process(self, current_id, obj, cursor, bundle_id):
        # Returns a tuple:
        # (name, size, references, field_count)
        # See _recursive_process for details.
        #
        # Should be implemented by sub classes.
        raise NotImplementedError()

    def init_database(self, cursor):
        # Initialize database structures needed by this type.
        #
        # Should be implemented by sub classes.
        raise NotImplementedError()

    def _recursive_process(self, obj, field_path):
        """
        Recursively analyze obj (a dict, a list or a Field).

        Returns a tuple (size, references, field_count).
        References are stored in a list of tuples
        (referee_id, field_path, referee_type, is_built_in).
        The field_path looks like this: m_ClipBindingConstant:genericBindings[3]:data:script
        is_built_in is true if the referee is a built in resource.

        Raises Exception on any other kind of value; the offending value is
        printed before the error propagates.
        """
        count = 0
        size = 0
        references = []

        try:
            if isinstance(obj, dict):
                if field_path:
                    # Separate fields with ":"
                    field_path += ":"
                for key, child in obj.items():
                    # Recursively process object.
                    s, r, c = self._recursive_process(child, field_path + key)
                    count += 1 + c
                    size += s
                    references.extend(r)
            elif isinstance(obj, list):
                # Recursively iterate over objects in list; list items do not
                # count as fields by themselves.
                for i, child in enumerate(obj):
                    s, r, c = self._recursive_process(child, "{0}[{1}]".format(field_path, i))
                    count += c
                    size += s
                    references.extend(r)
            elif isinstance(obj, Field):
                if obj.type == "PPtr":
                    pptr = obj.value
                    object_id = pptr["ID"]
                    # Only add reference if PPtr is not null
                    if object_id:
                        file_id = pptr["GlobalFileIndex"]
                        references.append((
                            self._id_generator.get_id(file_id, object_id),
                            field_path,
                            pptr["Type"],
                            self._file_index.is_built_in(file_id),
                        ))
                    count += 1
                    size += 12 # Size of PPtr?
                elif obj.type.startswith("pod_vector:"):
                    size += BaseHandler._get_size(obj)
                elif isinstance(obj.value, (dict, list)):
                    # Recursively process object.
                    s, r, c = self._recursive_process(obj.value, field_path)
                    count += 1 + c
                    size += s
                    references.extend(r)
                else:
                    size += BaseHandler._get_size(obj)
            else:
                raise Exception("Unexpected type encountered when processing object: " + type(obj).__name__)
        except Exception:
            # Show the value being processed, then let the error propagate.
            print (obj)
            raise

        return (size, references, count)

    @staticmethod
    def _get_size(field):
        """
        Return the size in bytes of a field (anything with type and value).

        Fixed size types come from _SIZE_MAP, strings count their length and
        "pod_vector:<type>" fields are element count times element size.
        Unhandled types (including pod vectors of an unknown element type)
        print a warning and count as 0.
        """
        size = BaseHandler._SIZE_MAP.get(field.type)
        if size is not None:
            return size

        if field.type == "string":
            return len(field.value)

        if field.type.startswith("pod_vector:"):
            # Strip the "pod_vector:" prefix to get the element type.
            element_size = BaseHandler._SIZE_MAP.get(field.type[11:])
            if element_size is not None:
                return len(field.value) * element_size

        print("Warning: unhandled type {0}!".format(field.type))
        return 0


class DefaultHandler(BaseHandler):
    """Handler that only reports an object's name and its recursive stats.

    It owns no database tables of its own.
    """

    def __init__(self, id_generator, file_index):
        super(DefaultHandler, self).__init__(id_generator, file_index)

    def process(self, current_id, obj, cursor, bundle_id):
        """Return (name,) followed by the result of _recursive_process(obj, "").

        The name is the value of the "m_Name" field, or "" when that field
        is missing or falsy.
        """
        name_field = obj.get("m_Name")
        if name_field:
            name = name_field.value
        else:
            name = ""

        stats = self._recursive_process(obj, "")
        return (name,) + stats

    def init_database(self, cursor):
        """Nothing to create for the default handler."""
        pass


class MeshHandler(BaseHandler):
    """Handler for mesh objects; fills the `meshes` table and its views."""

    def __init__(self, id_generator, file_index):
        super(MeshHandler, self).__init__(id_generator, file_index)

    def process(self, current_id, obj, cursor, bundle_id):
        """Insert one row into `meshes` and return the object's statistics.

        Returns (name, size, references, field_count), where size is the
        recursively computed size plus the vertex data size (0 for
        compressed meshes).
        """
        name = obj["m_Name"].value
        compression = obj["m_MeshCompression"].value
        rw_enabled = obj["m_IsReadable"].value

        vertex_size = 0

        if compression == 0:
            # Index buffer size depends on the format which is either 16 or 32 bits per index.
            index_buffer_size = BaseHandler._get_size(obj["m_IndexBuffer"])
            bytes_per_index = 2 if obj["m_IndexFormat"].value == 0 else 4
            # Floor division: "/" would produce a float on Python 3, but the
            # count is stored in an INTEGER column.
            indices = index_buffer_size // bytes_per_index
            vertices = obj["m_VertexData"].value["m_VertexCount"].value

            # Ugly hack because vertex data is stored in a way that doesn't respect the syntax of the document.
            vertex_size = obj["m_VertexData"].value["size"].value

            # If mesh data is in a stream file, the size is 0.
            if vertex_size == 0:
                # Get the size of streamed data instead.
                vertex_size = obj["m_StreamData"].value["size"].value

        else:
            compressed_mesh = obj["m_CompressedMesh"].value
            # Three items per vertex; floor division keeps the count integral
            # on Python 3 as well.
            vertices = compressed_mesh["m_Vertices"].value["m_NumItems"].value // 3
            indices = compressed_mesh["m_Triangles"].value["m_NumItems"].value

        cursor.execute('''
            INSERT INTO meshes(id, indices, vertices, compression, rw_enabled)
                VALUES(?,?,?,?,?)
        ''', (current_id, indices, vertices, compression, rw_enabled))

        size, references, field_count = self._recursive_process(obj, "")
        return (name, size + vertex_size, references, field_count)

    def init_database(self, cursor):
        """Create the `meshes` table plus the mesh_view and view_rw_meshes views."""
        cursor.execute('''
        CREATE TABLE meshes(
            id INTEGER,
            indices INTEGER,
            vertices INTEGER,
            compression INTEGER,
            rw_enabled INTEGER,
            PRIMARY KEY (id),
            FOREIGN KEY (id) references objects(id)
        )
        ''')
        cursor.execute('''
            CREATE VIEW mesh_view AS
            SELECT
                object_view.*,
                meshes.indices,
                meshes.vertices,
                meshes.compression,
                meshes.rw_enabled
            FROM object_view INNER JOIN meshes ON object_view.id = meshes.id
        ''')
        cursor.execute('''
            CREATE VIEW view_rw_meshes AS
            SELECT * FROM mesh_view WHERE rw_enabled = 1
        ''')


class Texture2DHandler(BaseHandler):
    """Handler for 2D textures; fills the `textures` table and its views."""

    def __init__(self, id_generator, file_index):
        super(Texture2DHandler, self).__init__(id_generator, file_index)

    def process(self, current_id, obj, cursor, bundle_id):
        """Insert one row into `textures` and return the object's statistics.

        Returns (name, size, references, field_count). The size is the
        value of "m_CompleteImageSize"; the recursively computed size is
        discarded.
        """
        texture_name = obj["m_Name"].value
        image_size = obj["m_CompleteImageSize"].value
        texture_format = obj["m_TextureFormat"].value
        readable = obj["m_IsReadable"].value
        mips = obj["m_MipCount"].value
        width = obj["m_Width"].value
        height = obj["m_Height"].value

        row = (current_id, width, height, texture_format, readable, mips)
        cursor.execute('''
            INSERT INTO textures(id, width, height, format, rw_enabled, mip_count)
                VALUES(?,?,?,?,?,?)
        ''', row)

        _unused_size, refs, fields = self._recursive_process(obj, "")

        return (texture_name, image_size, refs, fields)

    def init_database(self, cursor):
        """Create the texture tables and views and load the format names."""
        # (id, name) rows for the texture_formats lookup table.
        format_rows = [
            (0, "None"),
            (1, "Alpha8"),
            (2, "ARGB4444"),
            (3, "RGB24"),
            (4, "RGBA32"),
            (5, "ARGB32"),
            (6, "ARGBFloat"),
            (7, "RGB565"),
            (8, "BGR24"),
            (9, "AlphaLum16"),
            (10, "DXT1"),
            (11, "DXT3"),
            (12, "DXT5"),
            (13, "RGBA4444"),
            (14, "BGRA32"),
            (15, "RHalf"),
            (16, "RGHalf"),
            (17, "RGBAHalf"),
            (18, "RFloat"),
            (19, "RGFloat"),
            (20, "RGBAFloat"),
            (21, "YUY2"),
            (22, "RGB9e5Float"),
            (23, "RGBFloat"),
            (24, "BC6H"),
            (25, "BC7"),
            (26, "BC4"),
            (27, "BC5"),
            (28, "DXT1Crunched"),
            (29, "DXT5Crunched"),
            (30, "PVRTC_RGB2"),
            (31, "PVRTC_RGBA2"),
            (32, "PVRTC_RGB4"),
            (33, "PVRTC_RGBA4"),
            (34, "ETC_RGB4"),
            (35, "ATC_RGB4"),
            (36, "ATC_RGBA8"),
            (41, "EAC_R"),
            (42, "EAC_R_SIGNED"),
            (43, "EAC_RG"),
            (44, "EAC_RG_SIGNED"),
            (45, "ETC2_RGB"),
            (46, "ETC2_RGBA1"),
            (47, "ETC2_RGBA8"),
            (48, "ASTC_RGB_4x4"),
            (49, "ASTC_RGB_5x5"),
            (50, "ASTC_RGB_6x6"),
            (51, "ASTC_RGB_8x8"),
            (52, "ASTC_RGB_10x10"),
            (53, "ASTC_RGB_12x12"),
            (54, "ASTC_RGBA_4x4"),
            (55, "ASTC_RGBA_5x5"),
            (56, "ASTC_RGBA_6x6"),
            (57, "ASTC_RGBA_8x8"),
            (58, "ASTC_RGBA_10x10"),
            (59, "ASTC_RGBA_12x12"),
            (60, "ETC_RGB4_3DS"),
            (61, "ETC_RGBA8_3DS"),
            (62, "RG16"),
            (63, "R8"),
            (64, "ETC_RGB4Crunched"),
            (65, "ETC2_RGBA8Crunched"),
        ]

        # Lookup table mapping format ids to names.
        cursor.execute('''
            CREATE TABLE texture_formats(
                id INTEGER,
                format TEXT,
                PRIMARY KEY (id)
            )
        ''')

        # One row per processed texture.
        cursor.execute('''
            CREATE TABLE textures(
                id INTEGER,
                width INTEGER,
                height INTEGER,
                format INTEGER,
                mip_count INTEGER,
                rw_enabled INTEGER,
                PRIMARY KEY (id)
                FOREIGN KEY (id) references objects(id)
                FOREIGN KEY (format) references texture_formats(id)
            )
        ''')

        # Textures joined with the generic object view and the format names.
        cursor.execute('''
            CREATE VIEW texture_view AS
            SELECT
                object_view.*,
                texture_formats.format,
                textures.width,
                textures.height,
                textures.mip_count,
                textures.rw_enabled
            FROM object_view
            INNER JOIN textures ON object_view.id = textures.id
            LEFT JOIN texture_formats ON textures.format = texture_formats.id
        ''')

        cursor.executemany('''
            INSERT INTO texture_formats values (?,?)
        ''', format_rows)

        # Convenience views built on top of texture_view.
        cursor.execute('''
            CREATE VIEW view_rw_textures AS
            SELECT * FROM texture_view WHERE rw_enabled = 1
        ''')
        cursor.execute('''
            CREATE VIEW view_mipmapped_textures AS
            SELECT * FROM texture_view WHERE mip_count > 1
        ''')


class ShaderHandler(BaseHandler):
    """Handler for shaders; fills the shader tables and their views."""

    def __init__(self, id_generator, file_index):
        super(ShaderHandler, self).__init__(id_generator, file_index)

    @staticmethod
    def _sub_program_keywords(sp_data, names):
        # Resolve the keyword names of one sub program from its keyword indices.
        # Uses "m_KeywordIndices" when present, otherwise the global indices
        # followed by the local ones.
        if "m_KeywordIndices" in sp_data:
            return [names[kwi] for kwi in sp_data["m_KeywordIndices"].value]

        keywords = [names[kwi] for kwi in sp_data["m_GlobalKeywordIndices"].value]
        keywords.extend([names[kwi] for kwi in sp_data["m_LocalKeywordIndices"].value])
        return keywords

    def process(self, current_id, obj, cursor, bundle_id):
        """Insert the shader and its sub programs and return its statistics.

        Writes one `shader_subprograms` row per vertex/fragment sub program
        and one `shaders` row for the shader itself. The `passes` column
        holds the number of passes summed over all sub shaders, and is 0 for
        a shader without sub shaders.

        Returns (name,) followed by the result of _recursive_process(obj, "").
        """
        parsed_form = obj["m_ParsedForm"].value
        name = parsed_form["m_Name"].value
        properties = len(parsed_form["m_PropInfo"].value["m_Props"].value)
        sub_shaders = parsed_form["m_SubShaders"].value
        # Counted across all sub shaders. Initialised here so that a shader
        # without any sub shader no longer fails on an unbound pass counter.
        total_passes = 0
        total_subprograms = 0
        unique_progs = set()
        unique_keywords = set()

        # Count number of sub shaders and sub programs.
        for ss in sub_shaders:
            passes = ss["data"].value["m_Passes"].value
            # Pass index within the current sub shader (restarts at 0).
            pass_num = 0
            for p in passes:
                pass_data = p["data"].value

                # Map keyword index -> keyword name for this pass.
                names = {}
                for k, n in pass_data["m_NameIndices"].value.items():
                    if "data" in k:
                        names[n.value["second"].value] = n.value["first"].value

                # Process vertex sub programs.
                sub_programs = pass_data["progVertex"].value["m_SubPrograms"].value
                sp_num = 0
                for sp in sub_programs:
                    sp_data = sp["data"].value
                    hw_tier = sp_data["m_ShaderHardwareTier"].value
                    gpu_program_type = sp_data["m_GpuProgramType"].value
                    keywords = self._sub_program_keywords(sp_data, names)

                    unique_progs.add(sp_data["m_BlobIndex"].value)
                    unique_keywords.update(keywords)

                    cursor.execute('''
                        INSERT INTO shader_subprograms(shader, pass, subprogram, hw_tier, prog_type, type, keywords)
                            VALUES(?,?,?,?,?,?,?)
                    ''', (current_id, pass_num, sp_num, hw_tier, "vertex", gpu_program_type, ", ".join(keywords)))

                    sp_num += 1
                    total_subprograms += 1

                # Process fragment sub programs.
                sub_programs = pass_data["progFragment"].value["m_SubPrograms"].value
                sp_num = 0
                for sp in sub_programs:
                    sp_data = sp["data"].value
                    hw_tier = sp_data["m_ShaderHardwareTier"].value
                    gpu_program_type = sp_data["m_GpuProgramType"].value
                    keywords = self._sub_program_keywords(sp_data, names)

                    unique_progs.add(sp_data["m_BlobIndex"].value)
                    unique_keywords.update(keywords)

                    cursor.execute('''
                        INSERT INTO shader_subprograms(shader, pass, subprogram, hw_tier, prog_type, type, keywords)
                            VALUES(?,?,?,?,?,?,?)
                    ''', (current_id, pass_num, sp_num, hw_tier, "fragment", gpu_program_type, ", ".join(keywords)))

                    sp_num += 1
                    total_subprograms += 1

                pass_num += 1
                total_passes += 1

        cursor.execute('''
            INSERT INTO shaders(id, properties, sub_shaders, passes, sub_programs, unique_programs, keywords)
                VALUES(?,?,?,?,?,?,?)
        ''', (current_id, properties, len(sub_shaders), total_passes, total_subprograms, len(unique_progs), ", ".join(sorted(unique_keywords))))

        return (name,) + self._recursive_process(obj, "")

    def init_database(self, cursor):
        """Create the shader tables and views and load the GPU program types."""
        # (name, id) rows for the gpu_program_types lookup table.
        program_types = [
            ("Unknown", 0),
            ("GLLegacy_Removed", 1),
            ("GLES31AEP", 2),
            ("GLES31", 3),
            ("GLES3", 4),
            ("GLES", 5),
            ("GLCore32", 6),
            ("GLCore41", 7),
            ("GLCore43", 8),
            ("DX9VertexSM20_Removed", 9),
            ("DX9VertexSM30_Removed", 10),
            ("DX9PixelSM20_Removed", 11),
            ("DX9PixelSM30_Removed", 12),
            ("DX10Level9Vertex_Removed", 13),
            ("DX10Level9Pixel_Removed", 14),
            ("DX11VertexSM40", 15),
            ("DX11VertexSM50", 16),
            ("DX11PixelSM40", 17),
            ("DX11PixelSM50", 18),
            ("DX11GeometrySM40", 19),
            ("DX11GeometrySM50", 20),
            ("DX11HullSM50", 21),
            ("DX11DomainSM50", 22),
            ("MetalVS", 23),
            ("MetalFS", 24),
            ("SPIRV", 25),
            ("ConsoleVS", 26),
            ("ConsoleFS", 27),
            ("ConsoleHS", 28),
            ("ConsoleDS", 29),
            ("ConsoleGS", 30),
        ]
        cursor.execute('''
            CREATE TABLE gpu_program_types(
                id INTEGER,
                type TEXT,
                PRIMARY KEY (id)
            )
        ''')

        cursor.executemany('''
            INSERT INTO gpu_program_types (type, id) values (?,?)
        ''', program_types)
        cursor.execute('''
            CREATE TABLE shaders(
                id INTEGER,
                properties INTEGER,
                sub_shaders INTEGER,
                passes INTEGER,
                sub_programs INTEGER,
                unique_programs INTEGER,
                keywords TEXT,
                PRIMARY KEY (id),
                FOREIGN KEY (id) references objects(id)
            )
        ''')
        cursor.execute('''
            CREATE TABLE shader_subprograms(
                shader INTEGER,
                pass INTEGER,
                subprogram INTEGER,
                hw_tier INTEGER,
                prog_type TEXT,
                type INTEGER,
                keywords TEXT,
                FOREIGN KEY (shader) references shaders(id),
                FOREIGN KEY (type) references gpu_program_types(id)
            )
        ''')
        cursor.execute('''
            CREATE VIEW shader_view AS
            SELECT
                object_view.*,
                shaders.properties,
                shaders.sub_shaders,
                shaders.passes,
                shaders.sub_programs,
                shaders.unique_programs,
                shaders.keywords
            FROM object_view INNER JOIN shaders ON object_view.id = shaders.id
        ''')
        # Per-name breakdown with a human-readable total size.
        cursor.execute('''
            CREATE VIEW view_breakdown_shaders AS
            SELECT name, count(*) AS instances,
            CASE
                WHEN sum(size) < 1024 THEN printf("%!5.1f B", sum(size) * 1.0)
                WHEN sum(size) >=  1024 AND sum(size) < (1024 * 1024) THEN printf("%!5.1f KB", sum(size) / 1024.0)
                WHEN sum(size) >= (1024 * 1024)  AND sum(size) < (1024 * 1024 * 1024) THEN printf("%!5.1f MB", sum(size) / 1024.0 / 1024)
                WHEN sum(size) >= (1024 * 1024 * 1024) THEN printf("%!5.1f GB", sum(size) / 1024.0 / 1024 / 1024)
            END AS pretty_total_size,
            sum(size) AS total_size, GROUP_CONCAT(bundle, CHAR(13)) AS in_bundles
            FROM shader_view
            GROUP BY name
            ORDER BY total_size DESC, instances DESC
        ''')
        cursor.execute('''
            CREATE VIEW shader_subprogram_view AS
            SELECT s.*, pt.type AS api_type, sp.pass, sp.hw_tier, sp.prog_type, sp.keywords AS prog_keywords FROM shader_view s
            LEFT JOIN shader_subprograms sp ON s.id = sp.shader
            LEFT JOIN gpu_program_types pt ON pt.id = sp.type
        ''')


class AudioClipHandler(BaseHandler):
    """Handler for audio clips; fills the audio tables and their views."""

    def __init__(self, id_generator, file_index):
        super(AudioClipHandler, self).__init__(id_generator, file_index)

    def process(self, current_id, obj, cursor, bundle_id):
        """Insert one row into `audio_clips` and return the clip's statistics.

        Returns (name, size, references, field_count). The size is the
        "m_Size" value of the clip's "m_Resource" field; the recursively
        computed size is discarded.
        """
        name = obj["m_Name"].value
        size = obj["m_Resource"].value["m_Size"].value

        cursor.execute('''
            INSERT INTO audio_clips(id, bits_per_sample, frequency, channels, load_type, format)
                VALUES(?,?,?,?,?,?)
        ''', (
            current_id,
            obj["m_BitsPerSample"].value,
            obj["m_Frequency"].value,
            obj["m_Channels"].value,
            obj["m_LoadType"].value,
            obj["m_CompressionFormat"].value)
        )

        _, references, field_count = self._recursive_process(obj, "")

        return (name, size, references, field_count)

    def init_database(self, cursor):
        """Create the audio tables and views and load the lookup rows."""
        audio_load_types = [
            (0, "Decompress on Load"),
            (1, "Compressed in Memory"),
            (2, "Streaming"),
        ]
        audio_formats = [
            (0, "PCM"),
            (1, "Vorbis"),
            (2, "ADPCM"),
            (3, "MP3"),
            (4, "PSMVAG"),
            (5, "HEVAG"),
            (6, "XMA"),
            (7, "AAC"),
            (8, "GCADPCM"),
            (9, "ATRAC9"),
        ]
        cursor.execute('''
            CREATE TABLE audio_load_types(
                id INTEGER,
                type TEXT,
                PRIMARY KEY (id)
            )
        ''')
        cursor.execute('''
            CREATE TABLE audio_formats(
                id INTEGER,
                format TEXT,
                PRIMARY KEY (id)
            )
        ''')
        # The foreign keys point at the lookup tables' `id` primary keys,
        # which are the columns audio_clip_view joins on. They previously
        # named audio_load_types(load_type), a column that does not exist,
        # and audio_formats(format), the non-key text column.
        cursor.execute('''
            CREATE TABLE audio_clips(
                id INTEGER,
                bits_per_sample INTEGER,
                frequency INTEGER,
                channels INTEGER,
                load_type INTEGER,
                format INTEGER,
                PRIMARY KEY (id),
                FOREIGN KEY (id) references objects(id)
                FOREIGN KEY (load_type) references audio_load_types(id)
                FOREIGN KEY (format) references audio_formats(id)
            )
        ''')
        cursor.execute('''
            CREATE VIEW audio_clip_view AS
            SELECT
                object_view.*,
                audio_clips.bits_per_sample,
                audio_clips.frequency,
                audio_clips.channels,
                audio_load_types.type,
                audio_formats.format
            FROM object_view
            INNER JOIN audio_clips ON object_view.id = audio_clips.id
            LEFT JOIN audio_load_types ON audio_clips.load_type = audio_load_types.id
            LEFT JOIN audio_formats ON audio_clips.format = audio_formats.id
        ''')
        cursor.executemany('''
            INSERT INTO audio_load_types values (?,?)
        ''', audio_load_types)
        cursor.executemany('''
            INSERT INTO audio_formats values (?,?)
        ''', audio_formats)
        # Streaming clips under 1 MiB, or non-streaming clips over 1 MiB.
        cursor.execute('''
            CREATE VIEW view_suspicious_audio_clips AS
            SELECT name, size, bits_per_sample, frequency, channels, type, format, bundle FROM audio_clip_view
            WHERE (type = "Streaming" AND size < 1024*1024) OR (type <> "Streaming" AND size > 1024*1024)
        ''')


class AnimationClipHandler(BaseHandler):
    """Handler for animation clips; fills `animation_clips` and its view."""

    def __init__(self, id_generator, file_index):
        super(AnimationClipHandler, self).__init__(id_generator, file_index)

    def process(self, current_id, obj, cursor, bundle_id):
        """Record the clip's legacy flag and return its statistics.

        Returns (name,) followed by the result of _recursive_process(obj, "").
        """
        clip_name = obj["m_Name"].value
        is_legacy = obj["m_Legacy"].value

        row = (current_id, is_legacy)
        cursor.execute('''
            INSERT INTO animation_clips(id, legacy)
                VALUES(?,?)
        ''', row)

        stats = self._recursive_process(obj, "")
        return (clip_name,) + stats

    def init_database(self, cursor):
        """Create the `animation_clips` table and the animation_view view."""
        # One row per processed animation clip.
        cursor.execute('''
            CREATE TABLE animation_clips(
                id INTEGER,
                legacy INTEGER,
                PRIMARY KEY (id),
                FOREIGN KEY (id) references objects(id)
            )
        ''')

        # Animation clips joined with the generic object view.
        cursor.execute('''
            CREATE VIEW animation_view AS
            SELECT
                object_view.*,
                animation_clips.legacy
            FROM object_view INNER JOIN animation_clips ON object_view.id = animation_clips.id
        ''')


class AssetBundleHandler(BaseHandler):
    """Handler for asset bundle objects; fills the `assets` table and view."""

    def __init__(self, id_generator, file_index):
        super(AssetBundleHandler, self).__init__(id_generator, file_index)

    def process(self, current_id, obj, cursor, bundle_id):
        """Record every asset listed in "m_Container" and return the stats.

        Only container entries whose key contains "data" are considered.
        Returns (name,) followed by the result of _recursive_process(obj, "").
        """
        bundle_name = obj["m_Name"].value

        for key, entry in obj["m_Container"].value.items():
            if "data" in key:
                entry_fields = entry.value
                target = entry_fields["second"].value["asset"].value

                # Scenes have null pptr, ignore them.
                if target["ID"] == 0:
                    continue

                obj_id = self._id_generator.get_id(target["GlobalFileIndex"], target["ID"])
                asset_name = entry_fields["first"].value

                cursor.execute('''
                    INSERT OR REPLACE INTO assets(bundle_id, name, obj_id)
                        VALUES(?,?,?)
                ''', (bundle_id, asset_name, obj_id))

        stats = self._recursive_process(obj, "")
        return (bundle_name,) + stats

    def init_database(self, cursor):
        """Create the `assets` table and the asset_view view."""
        # One row per (bundle, object) pair.
        cursor.execute('''
            CREATE TABLE assets(
                bundle_id INTEGER,
                name TEXT,
                obj_id INTEGER,
                PRIMARY KEY (bundle_id, obj_id)
            )
        ''')

        # Assets joined with the generic object view.
        cursor.execute('''
            CREATE VIEW asset_view AS
            SELECT
                assets.name AS asset,
                object_view.*
            FROM assets
            INNER JOIN object_view ON object_view.id = assets.obj_id
        ''')


class IdGenerator(object):
    """Hands out one unique integer id per (file_id, object_id) pair."""

    def __init__(self):
        # (file_id, object_id) -> unique id, assigned in order of first request.
        self._id_map = {}

    def get_id(self, file_id, object_id):
        """Return the id of the pair, allocating the next free one if needed.

        New ids are consecutive integers starting at 0.
        """
        key = (file_id, object_id)
        # The default is evaluated before insertion, so a new pair gets the
        # current number of known pairs as its id.
        return self._id_map.setdefault(key, len(self._id_map))


class FileIndex(object):
    """Assigns a unique index to every file name and mirrors it in the DB.

    Creating an instance creates the `files` table in `db` and registers
    the known built-in files first, so they get the lowest indices.
    """

    def __init__(self, db):
        self._db = db
        # File names, positioned at their index.
        self.files = []
        # Lower-cased file name -> index.
        self.name_to_index = {}

        # Create files table.
        db.cursor().execute('''
            CREATE TABLE files(
                id INTEGER,
                name TEXT,
                PRIMARY KEY (id)
            )
        ''')
        db.commit()

        # Known built-in files, registered before anything else.
        for builtin_name in ("unity_builtin_extra", "unity default resources"):
            self.get_id(builtin_name)

        self._built_in_count = len(self.files)

    def is_built_in(self, index):
        """Return True when `index` belongs to one of the built-in files."""
        return index < self._built_in_count

    def get_id(self, name):
        """Return the index of file `name`, registering it when unknown.

        Names are compared case-insensitively. A newly registered file is
        also inserted into the `files` table and committed.
        """
        key = name.lower()
        known = self.name_to_index.get(key)
        if known is not None:
            return known

        index = len(self.files)
        self.files.append(key)
        self.name_to_index[key] = index

        # Add file to DB.
        self._db.cursor().execute('''
            INSERT INTO files(id, name)
                VALUES(?,?)
            ''', (index, key))
        self._db.commit()

        return index

# if running in debug, function to trap stack when breaking 
def debug_signal_handler(signal, frame):
    """Signal handler that drops into the pdb debugger.

    Both arguments are ignored.
    """
    from pdb import set_trace
    set_trace()

def run_tool_with_timeout(tool,filepath,ret_code,time_out,level=0):
    """Run `tool` on `filepath` in a helper process, waiting at most `time_out` seconds.

    tool      -- name of the tool, passed on to run_tool
    filepath  -- file handed to the tool
    ret_code  -- passed on to run_tool; it is not updated for the caller
    time_out  -- maximum time to wait, in seconds
    level     -- indent level for verbose logging

    If the helper process is still alive after `time_out` seconds, a
    "<tool> timeout" message is printed and the helper is terminated.
    """
    # Hand the callable and its arguments to Process separately. Writing
    # Process(run_tool(...)) ran the tool synchronously right here and gave
    # Process nothing to execute, so the timeout below never applied.
    p = Process(target=run_tool, args=(tool, filepath, ret_code, level))
    p.start()

    # Wait for time_out seconds or until process finishes
    p.join(time_out)

    if p.is_alive():
        print("{0} timeout".format(tool))

        # Terminate
        # NOTE(review): this stops the helper process; the external tool it
        # launched may keep running - TODO confirm.
        p.terminate()
        p.join()

# function for spawned process to run WebExtract
def run_tool(tool, filepath, ret_code, level=0):
    """Run the external `tool` on `filepath` and return its exit code.

    The executable is looked up in args.tool_path and its stdout/stderr are
    discarded. In verbose mode the command is logged at indent `level`.

    `ret_code` is accepted for compatibility with existing callers, but an
    argument cannot be used to pass the result back, so the exit code is
    returned instead.
    """
    with open(os.devnull, 'wb') as devnull:
        path = os.path.join(args.tool_path, tool)
        if args.verbose:
            debug_print("{0} {1}".format(tool, filepath), level)

        p = subprocess.Popen([path, filepath], stdout=devnull, stderr=devnull)
        p.communicate()
        return p.returncode

# print some output with an optional indent level
def debug_print(msg,level=0):
    """Print `msg`; in verbose mode prefix it with `level` dashes and "-> "."""
    prefix = ("-" * level + "-> ") if args.verbose else ""
    print("{0}{1}".format(prefix, msg))

# Script entry point: run the analyzer only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':  
    main()