Merge pull request #1010 from ehrenb/refactor-analysis

Ensure all Fields and Strings get wrapped as FieldAnalysis and StringAnalysis respectively when creating Analysis
This commit is contained in:
erev0s 2024-03-10 23:33:34 +02:00 committed by GitHub
commit 9f7a3c6a74
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 297 additions and 102 deletions

View File

@ -681,7 +681,9 @@ class MethodAnalysis:
self.method.get_access_flags_string(),
self.method.get_name(),
", ".join(args), ret))
bytecode.PrettyShow(self.basic_blocks.gets(), self.method.notes)
if not self.is_external():
bytecode.PrettyShow(self.basic_blocks.gets(), self.method.notes)
def show_xrefs(self):
data = "XREFto for %s\n" % self.method
@ -982,6 +984,7 @@ class ClassAnalysis:
"""
def __init__(self, classobj):
logger.info(f"Adding new ClassAnalysis: {classobj}")
# Automatically decide if the class is external or not
self.external = isinstance(classobj, ExternalClass)
@ -1118,6 +1121,18 @@ class ClassAnalysis:
def get_field_analysis(self, field):
    """
    Look up the analysis object stored for the given field.

    :param field: the key to look up in ``self._fields``
    :return: whatever ``self._fields.get(field)`` yields
        (presumably ``None`` for an unknown field, assuming ``_fields`` is a dict)
    """
    analysis = self._fields.get(field)
    return analysis
def add_field(self, field_analysis):
    """
    Register the given field analysis with this class analysis.

    The object is stored in ``self._fields`` under the key returned by
    ``field_analysis.get_field()``.
    Usually only called during Analysis.add.

    :param FieldAnalysis field_analysis: the field analysis to store
    """
    field = field_analysis.get_field()
    self._fields[field] = field_analysis
    # NOTE(review): propagating an ExternalField to the ExternalClass when
    # self.external is set is currently disabled; the disabled draft called
    # self.orig_class.add_method(...) with the field - confirm the intended
    # call before enabling it.
def add_field_xref_read(self, method, classobj, field, off):
"""
Add a Field Read to this class
@ -1391,14 +1406,14 @@ class Analysis:
It encapsulates all the Dalvik related functions into a single place, while you have still the ability to use
the functions from :class:`~androguard.core.bytecodes.dvm.DEX` and the related classes.
:param Optional[androguard.core.bytecodes.dvm.DEX] vm: inital DEX object (default None)
:param Optional[androguard.core.dex.DEX] vm: inital DEX object (default None)
"""
def __init__(self, vm=None):
# Contains DEX objects
self.vms = []
# A dict of {classname: ClassAnalysis}, populated on add(vm)
self.classes = dict()
# A dict of {string: StringAnalysis}, populated on create_xref()
# A dict of {string: StringAnalysis}, populated on add(vm) and create_xref()
self.strings = dict()
# A dict of {EncodedMethod: MethodAnalysis}, populated on add(vm)
self.methods = dict()
@ -1411,12 +1426,18 @@ class Analysis:
self.__created_xrefs = False
@property
def fields(self):
    """Property shortcut for :meth:`get_fields` (the FieldAnalysis objects)."""
    all_fields = self.get_fields()
    return all_fields
def add(self, vm):
"""
Add a DEX to this Analysis.
:param androguard.core.bytecodes.dvm.DEX vm: :class:`dvm.DEX` to add to this Analysis
:param androguard.core.dex.DEX vm: :class:`androguard.core.dex.DEX` to add to this Analysis
"""
self.vms.append(vm)
logger.info("Adding DEX file version {}".format(vm.version))
@ -1424,8 +1445,10 @@ class Analysis:
# TODO: This step can easily be multithreaded, as there is no dependency between the objects at this stage
tic = time.time()
for i, current_class in enumerate(vm.get_classes()):
# seed ClassAnalysis objects into classes attribute and add as new class
self.classes[current_class.get_name()] = ClassAnalysis(current_class)
new_class = self.classes[current_class.get_name()]
# Fix up the hidden api annotations (Android 10)
hidden_api = vm.get_hidden_api()
if hidden_api:
@ -1433,15 +1456,26 @@ class Analysis:
new_class.set_restriction_flag(rf)
new_class.set_domain_flag(df)
# seed MethodAnalysis objects into methods attribute and add to new class analysis
for method in current_class.get_methods():
self.methods[method] = MethodAnalysis(vm, method)
new_class.add_method(self.methods[method])
# Store for faster lookup during create_xrefs
m_hash = (current_class.get_name(), method.get_name(), str(method.get_descriptor()))
self.__method_hashes[m_hash] = self.methods[method]
# seed FieldAnalysis objects into to new class analysis
# since we access methods through a class property,
# which returns what's within a ClassAnalysis
# we don't have to track it internally in this class
for field in current_class.get_fields():
new_class.add_field(FieldAnalysis(field))
# seed StringAnalysis objects into strings attribute - connect later using xrefs
for string_value in vm.get_strings():
self.strings[string_value] = StringAnalysis(string_value)
logger.info("Added DEX in the analysis took : {:0d}min {:02d}s".format(*divmod(int(time.time() - tic), 60)))
def create_xref(self):
@ -1493,7 +1527,7 @@ class Analysis:
Note that this might be quite slow, as all instructions are parsed.
:param androguard.core.bytecodes.dvm.ClassDefItem current_class: The class to create xrefs for
:param androguard.core.dex.ClassDefItem current_class: The class to create xrefs for
"""
cur_cls_name = current_class.get_name()
@ -1585,7 +1619,7 @@ class Analysis:
elif 0x52 <= op_value <= 0x6d:
idx_field = instruction.get_ref_kind()
field_info = instruction.cm.vm.get_cm_field(idx_field)
field_item = instruction.cm.vm.get_field_descriptor(field_info[0], field_info[2], field_info[1])
field_item = instruction.cm.vm.get_encoded_field_descriptor(field_info[0], field_info[2], field_info[1])
if not field_item:
continue
@ -1731,6 +1765,28 @@ class Analysis:
if not cls.is_external():
yield cls
def get_internal_methods(self):
    """
    Iterate over all internal methods, i.e. every entry of ``self.methods``
    whose ``is_external()`` is false - the methods that are defined in the
    given set of :class:`~DEX`.

    :rtype: Iterator[MethodAnalysis]
    """
    yield from (
        method for method in self.methods.values() if not method.is_external()
    )
def get_external_methods(self):
    """
    Iterate over all external methods, i.e. every entry of ``self.methods``
    whose ``is_external()`` is true - the methods that are not defined in
    the given set of :class:`~DEX`.

    :rtype: Iterator[MethodAnalysis]
    """
    yield from (
        method for method in self.methods.values() if method.is_external()
    )
def get_strings_analysis(self):
"""
Returns a dictionary of strings and their corresponding :class:`StringAnalysis`

View File

@ -2480,7 +2480,7 @@ class FieldHIdItem:
def __init__(self, size, buff, cm):
self.offset = buff.tell()
self.elem = [FieldIdItem(buff, cm) for i in range(0, size)]
self.field_id_items = [FieldIdItem(buff, cm) for i in range(0, size)]
def set_off(self, off):
self.offset = off
@ -2489,30 +2489,30 @@ class FieldHIdItem:
return self.offset
def gets(self):
return self.elem
return self.field_id_items
def get(self, idx):
try:
return self.elem[idx]
return self.field_id_items[idx]
except IndexError:
return FieldIdItemInvalid()
def show(self):
nb = 0
for i in self.elem:
for i in self.field_id_items:
print(nb, end=' ')
i.show()
nb = nb + 1
def get_obj(self):
return [i for i in self.elem]
return [i for i in self.field_id_items]
def get_raw(self):
return b''.join(i.get_raw() for i in self.elem)
return b''.join(i.get_raw() for i in self.field_id_items)
def get_length(self):
length = 0
for i in self.elem:
for i in self.field_id_items:
length += i.get_length()
return length
@ -2659,7 +2659,7 @@ class MethodHIdItem:
self.offset = buff.tell()
self.methods = [MethodIdItem(buff, cm) for i in range(0, size)]
self.method_id_items = [MethodIdItem(buff, cm) for i in range(0, size)]
def set_off(self, off):
self.offset = off
@ -2667,33 +2667,36 @@ class MethodHIdItem:
def get_off(self):
return self.offset
def gets(self):
return self.method_id_items
def get(self, idx):
try:
return self.methods[idx]
return self.method_id_items[idx]
except IndexError:
return MethodIdItemInvalid()
def reload(self):
for i in self.methods:
for i in self.method_id_items:
i.reload()
def show(self):
print("METHOD_ID_ITEM")
nb = 0
for i in self.methods:
for i in self.method_id_items:
print(nb, end=' ')
i.show()
nb = nb + 1
def get_obj(self):
return [i for i in self.methods]
return [i for i in self.method_id_items]
def get_raw(self):
return b''.join(i.get_raw() for i in self.methods)
return b''.join(i.get_raw() for i in self.method_id_items)
def get_length(self):
length = 0
for i in self.methods:
for i in self.method_id_items:
length += i.get_length()
return length
@ -3679,7 +3682,7 @@ class ClassDefItem:
def get_methods(self):
"""
Return all methods of this class
Return all EncodedMethods of this class
:rtype: a list of :class:`EncodedMethod` objects
"""
@ -3689,7 +3692,7 @@ class ClassDefItem:
def get_fields(self):
"""
Return all fields of this class
Return all EncodedFields of this class
:rtype: a list of :class:`EncodedField` objects
"""
@ -7961,6 +7964,14 @@ class DEX:
# There is a rare case that the DEX has no classes
return []
def get_len_classes(self):
    """
    Count the classes, i.e. the length of what :meth:`get_classes` returns.

    :rtype: int
    """
    class_items = self.get_classes()
    return len(class_items)
def get_class(self, name):
"""
Return a specific class
@ -7974,41 +7985,23 @@ class DEX:
return i
return None
def get_method(self, name):
"""
Return a list all methods which corresponds to the regexp
:param name: the name of the method (a python regexp)
:rtype: a list with all :class:`EncodedMethod` objects
"""
# TODO could use a generator here
prog = re.compile(name)
l = []
for i in self.get_classes():
for j in i.get_methods():
if prog.match(j.get_name()):
l.append(j)
return l
def get_field(self, name):
"""
Return a list all fields which corresponds to the regexp
"""get field id item by name
:param name: the name of the field (a python regexp)
:rtype: a list with all :class:`EncodedField` objects
:type name: str
:return: the list of matching :class:`FieldIdItem` objects
:rtype: list
"""
# TODO could use a generator here
prog = re.compile(name)
l = []
for i in self.get_classes():
for j in i.get_fields():
if prog.match(j.get_name()):
l.append(j)
for i in self.get_fields():
if prog.match(i.name):
l.append(i)
return l
def get_all_fields(self):
def get_fields(self):
"""
Return a list of field items
@ -8019,7 +8012,31 @@ class DEX:
except AttributeError:
return []
def get_fields(self):
def get_len_fields(self):
    """
    Count the fields, i.e. the length of what :meth:`get_fields` returns.

    :rtype: int
    """
    field_items = self.get_fields()
    return len(field_items)
def get_encoded_field(self, name):
    """
    Collect every encoded field whose name matches the given regexp.

    The pattern is applied with ``re.match``, so it is anchored at the
    start of the value returned by each field's ``get_name()``.

    :param name: the name of the field (a python regexp)
    :rtype: a list with all :class:`EncodedField` objects
    """
    # TODO could use a generator here
    pattern = re.compile(name)
    return [
        field
        for field in self.get_encoded_fields()
        if pattern.match(field.get_name())
    ]
def get_encoded_fields(self):
"""
Return all field objects
@ -8032,9 +8049,76 @@ class DEX:
self.__cache_all_fields.append(j)
return self.__cache_all_fields
def get_len_encoded_fields(self):
    """
    Count the encoded fields, i.e. the length of what
    :meth:`get_encoded_fields` returns.

    :rtype: int
    """
    encoded_fields = self.get_encoded_fields()
    return len(encoded_fields)
def get_field(self, name):
    """Get the field id items whose name matches the given regexp.

    The pattern is applied with ``re.match`` to the ``name`` attribute of
    every item returned by :meth:`get_fields`, so it is anchored at the
    start of the name.

    :param name: the name of the field (a python regexp)
    :type name: str
    :return: the list of matching :class:`FieldIdItem` objects
    :rtype: list
    """
    pattern = re.compile(name)
    return [item for item in self.get_fields() if pattern.match(item.name)]
def get_method(self, name):
    """Get the method id items whose name matches the given regexp.

    The pattern is applied with ``re.match`` to the ``name`` attribute of
    every item returned by :meth:`get_methods`, so it is anchored at the
    start of the name.

    :param name: the name of the method (a python regexp)
    :type name: str
    :return: the list of matching :class:`MethodIdItem` objects
    :rtype: list
    """
    pattern = re.compile(name)
    return [item for item in self.get_methods() if pattern.match(item.name)]
def get_methods(self):
"""
Return all method objects
Return a list of method items
:rtype: a list of :class:`MethodIdItem` objects
"""
try:
return self.methods.gets()
except AttributeError:
return []
def get_len_methods(self):
    """
    Count the methods, i.e. the length of what :meth:`get_methods` returns.

    :rtype: int
    """
    method_items = self.get_methods()
    return len(method_items)
def get_encoded_method(self, name):
    """
    Collect every encoded method whose name matches the given regexp.

    The pattern is applied with ``re.match`` to the ``name`` attribute of
    every item returned by :meth:`get_encoded_methods`, so it is anchored
    at the start of the name.

    :param name: the name of the method (a python regexp)
    :rtype: a list with all :class:`EncodedMethod` objects
    """
    pattern = re.compile(name)
    return [
        method
        for method in self.get_encoded_methods()
        if pattern.match(method.name)
    ]
def get_encoded_methods(self):
"""
Return all encoded method objects
:rtype: a list of :class:`EncodedMethod` objects
"""
@ -8045,17 +8129,17 @@ class DEX:
self.__cache_all_methods.append(j)
return self.__cache_all_methods
def get_len_methods(self):
def get_len_encoded_methods(self):
"""
Return the number of methods
Return the number of encoded methods
:rtype: int
"""
return len(self.get_methods())
return len(self.get_encoded_methods())
def get_method_by_idx(self, idx):
def get_encoded_method_by_idx(self, idx):
"""
Return a specific method by using an index
Return a specific encoded method by using an index
:param idx: the index of the method
:type idx: int
@ -8072,9 +8156,9 @@ class DEX:
except KeyError:
return None
def get_method_descriptor(self, class_name, method_name, descriptor):
def get_encoded_method_descriptor(self, class_name, method_name, descriptor):
"""
Return the specific method
Return the specific encoded method given a class name, method name, and descriptor
:param class_name: the class name of the method
:type class_name: string
@ -8096,9 +8180,9 @@ class DEX:
return self.__cache_methods.get(key)
def get_methods_descriptor(self, class_name, method_name):
def get_encoded_methods_class_method(self, class_name, method_name):
"""
Return the specific methods of the class
Return the specific encoded methods of the class
:param class_name: the class name of the method
:type class_name: string
@ -8107,18 +8191,14 @@ class DEX:
:rtype: None or a :class:`EncodedMethod` object
"""
l = []
for i in self.get_classes():
if i.get_name() == class_name:
for j in i.get_methods():
if j.get_name() == method_name:
l.append(j)
for i in self.get_encoded_methods():
if i.get_name() == method_name and i.get_class_name() == class_name:
return i
return None
return l
def get_methods_class(self, class_name):
def get_encoded_methods_class(self, class_name):
"""
Return all methods of a specific class
Return all encoded methods of a specific class by class name
:param class_name: the class name
:type class_name: string
@ -8126,16 +8206,14 @@ class DEX:
:rtype: a list with :class:`EncodedMethod` objects
"""
l = []
for i in self.get_classes():
for j in i.get_methods():
if class_name == j.get_class_name():
l.append(j)
for i in self.get_encoded_methods():
if class_name == i.get_class_name():
l.append(i)
return l
def get_fields_class(self, class_name):
def get_encoded_fields_class(self, class_name):
"""
Return all fields of a specific class
Return all encoded fields of a specific class by class name
:param class_name: the class name
:type class_name: string
@ -8143,16 +8221,14 @@ class DEX:
:rtype: a list with :class:`EncodedField` objects
"""
l = []
for i in self.get_classes():
for j in i.get_fields():
if class_name == j.get_class_name():
l.append(j)
for i in self.get_encoded_fields():
if class_name == i.get_class_name():
l.append(i)
return l
def get_field_descriptor(self, class_name, field_name, descriptor):
def get_encoded_field_descriptor(self, class_name, field_name, descriptor):
"""
Return the specific field
Return the specific encoded field given a class name, field name, and descriptor
:param class_name: the class name of the field
:type class_name: string
@ -8186,6 +8262,14 @@ class DEX:
"""
return [i.get() for i in self.strings]
def get_len_strings(self):
    """
    Count the strings, i.e. the length of what :meth:`get_strings` returns.

    :rtype: int
    """
    string_values = self.get_strings()
    return len(string_values)
def get_regex_strings(self, regular_expressions):
"""
Return all target strings matched the regex

View File

@ -21,11 +21,27 @@ class AnalysisTest(unittest.TestCase):
def testAPK(self):
a, d, dx = AnalyzeAPK(os.path.join(test_dir, "data/APK/a2dp.Vol_137.apk"))
self.assertEqual(len(list(dx.get_internal_classes())), 1353) # checked by reading the dex header
self.assertEqual(len(dx.get_strings()), 1564)
self.assertEqual(len(list(dx.get_methods())), 12792) # according to DEX Header 12795
self.assertEqual(len(list(dx.get_fields())), 3033) # According to DEX Header 4005
self.assertEqual(len(list(dx.get_external_classes())), 388)
self.assertEqual(len(list(dx.get_internal_classes())), 1353) # dex header header->headerItem->classDefsSize
self.assertEqual(len(list(dx.get_external_classes())), 388) # difficult to check, cannot find using JADX
self.assertEqual(len(list(dx.get_classes())), 1741) # sum of internal and external classes
self.assertEqual(len(dx.get_strings()), 13523) # dex header header->headerItem->stringsIdsSize
# don't have a way to discern external vs internal fields currently,
# header->headerItemFieldIdsSize is 4005, but there must be 573 more external added
# so this is difficult to derive. Even JADX seems to disagree with 4005 number?
self.assertEqual(len(list(dx.get_fields())), 4005 + 573)
# internal+external methods should sum up to header->headerItem->methodIdsSize
self.assertEqual(len(list(dx.get_internal_methods())), 9676) # difficult to check, can use jadx-gui and see summary
self.assertEqual(len(list(dx.get_external_methods())), 3116) # difficult to check
# TODO: the DEX header says 12795 here, but 9676 + 3116 adds up to 12792
# JADX corroborates 9676, so I think 3116 is off, and a few unnecessary
# ExternalMethods are added somewhere
self.assertEqual(len(list(dx.get_methods())), 12792) # dex header header->headerItem->methodIdsSize
for cls in dx.get_external_classes():
self.assertEqual(cls.name[0], 'L')
@ -44,15 +60,23 @@ class AnalysisTest(unittest.TestCase):
no_external=True))), 94)
# Find url like strings
self.assertEqual(len(list(dx.find_strings(r".*:\/\/.*"))), 15)
self.assertEqual(len(list(dx.find_strings(r".*:\/\/.*"))), 16)
# find String fields
self.assertEqual(len(list(dx.find_fields(classname="^(?!Landroid).*;$", fieldtype=r"Ljava\/lang\/String;"))),
63)
95)#63)
def testAnalysis(self):
import sys
h, d, dx = AnalyzeDex(os.path.join(test_dir, "data/APK/AnalysisTest.dex"))
self.assertEqual(len(list(dx.get_internal_classes())), 1)
self.assertEqual(len(list(dx.get_internal_methods())), 4)
self.assertEqual(len(list(dx.get_external_methods())), 4)
self.assertEqual(len(list(dx.get_methods())), 8)
self.assertEqual(len(dx.get_strings()), 21)
self.assertEqual(len(list(dx.get_fields())), 0)
self.assertEqual(h, "4595fc25104f3fcd709163eb70ca476edf116753607ec18f09548968c71910dc")
self.assertIsInstance(d, DEX)
self.assertIsInstance(dx, analysis.Analysis)
@ -269,7 +293,7 @@ class AnalysisTest(unittest.TestCase):
"""Tests if String offsets in bytecode are correctly stored"""
_, _, dx = AnalyzeDex(os.path.join(test_dir, "data/APK/AnalysisTest.dex"))
self.assertEqual(len(dx.get_strings()), 1)
self.assertEqual(len(dx.get_strings()), 21)
self.assertIsInstance(dx.strings['Hello world'], analysis.StringAnalysis)
sa = dx.strings['Hello world']
@ -282,7 +306,7 @@ class AnalysisTest(unittest.TestCase):
"""Tests if Field offsets in bytecode are correctly stored"""
_, _, dx = AnalyzeDex(os.path.join(test_dir, "data/APK/FieldsTest.dex"))
self.assertEqual(len(dx.get_strings()), 4)
self.assertEqual(len(dx.get_strings()), 20)
self.assertIn('hello world', dx.strings.keys())
self.assertIn('sdf', dx.strings.keys())
self.assertIn('hello mars', dx.strings.keys())
@ -350,6 +374,5 @@ class AnalysisTest(unittest.TestCase):
self.assertEqual(class1.restriction_flag, HiddenApiClassDataItem.RestrictionApiFlag.BLACKLIST)
self.assertEqual(class1.domain_flag, HiddenApiClassDataItem.DomapiApiFlag.NONE)
if __name__ == '__main__':
unittest.main()

View File

@ -17,15 +17,47 @@ class MockClassManager():
def get_odex_format(self):
return False
class accessflagsTest(unittest.TestCase):
class VMClassTest(unittest.TestCase):
@classmethod
def setUpClass(cls):
test_apk_path = os.path.join(test_dir, 'data/APK/TestActivity.apk')
cls.a, cls.d, cls.dx = AnalyzeAPK(test_apk_path)
def testVMClass(self):
"""test number of ClassDefItems, StringDataItems, FieldIdItems, and MethodIdItems"""
num_class_def_items = 0
num_strings_data_items = 0
num_field_id_items = 0
num_method_id_items = 0
# the below field exists in the fieldIds list, but
# their class doesnt exist, this is bc its loaded at runtime
# 19 [FieldIdItem]: class_idx=0x13 type_idx=0x242 name_idx=0x1099
# classIdx = 0x13 = 19
# typeIdx = 0x242 = 578
# nameIdx = 0x1099 = 4249
# className = Landroid/app/Notification;
# typeName = [J
# fieldName = vibrate
# see DEX format spec https://source.android.com/docs/core/runtime/dex-format
# https://reverseengineering.stackexchange.com/questions/21767/dex-file-referenced-type-is-not-defined-in-file
# field ids, type ids, and method ids references
# are not required to be defined in the dex since they can be resolved at runtime via shared library
for vm in self.dx.vms:
num_class_def_items += vm.get_len_classes() # ClassDefItems
num_strings_data_items += vm.get_len_strings() # StringDataItems
num_field_id_items += vm.get_len_fields() # FieldIdItems
num_method_id_items += vm.get_len_methods() # MethodIdItems
self.assertEqual(len(self.dx.vms), 1)
self.assertEqual(num_class_def_items, 340)
self.assertEqual(num_strings_data_items, 4329)
self.assertEqual(num_field_id_items, 865)
self.assertEqual(num_method_id_items, 3602)
def testAccessflags(self):
class_name_accessflag_map = {
'Ltests/androguard/TestLoops;': {
'access_flag': 0x1, # public

View File

@ -24,7 +24,7 @@ class TestDexCodeParsing(unittest.TestCase):
dif = Differ()
for m in d.get_methods():
for m in d.get_encoded_methods():
if not m.get_code():
continue

View File

@ -18,7 +18,7 @@ class RenameTest(unittest.TestCase):
# self.d.set_vmanalysis(self.dx)
def testMethodRename(self):
meth, = self.d.get_method("testDouble")
meth, = self.d.get_encoded_method("testDouble")
clas = self.d.get_class(meth.get_class_name())
self.assertEqual(meth.get_name(), "testDouble")
self.assertIn(meth.get_name(), [i.name for i in clas.get_methods()])
@ -28,7 +28,7 @@ class RenameTest(unittest.TestCase):
self.assertNotIn("testDouble", [i.name for i in clas.get_methods()])
def testFieldRename(self):
field, = self.d.get_field("FLAG_REGISTER_CONTENT_OBSERVER")
field, = self.d.get_encoded_field("FLAG_REGISTER_CONTENT_OBSERVER")
self.assertEqual(field.get_name(), "FLAG_REGISTER_CONTENT_OBSERVER")
field.set_name("FLAG_REGISTER_CONTENT_OBSERVER_RENAMED")
self.assertEqual(field.get_name(), "FLAG_REGISTER_CONTENT_OBSERVER_RENAMED")

View File

@ -158,7 +158,7 @@ class TypesTest(unittest.TestCase):
with open(TEST_CASE, "rb") as fd:
digest, d, dx = s.addDEX(TEST_CASE, fd.read())
for method in filter(lambda x: x.full_name in VALUES, d.get_methods()):
for method in filter(lambda x: x.full_name in VALUES, d.get_encoded_methods()):
# print("METHOD", method.full_name)
for i in filter(lambda x: 'const' in x.get_name(), method.get_instructions()):