Skip to content

Commit

Permalink
add support of lief, based on a suggestion by trou
Browse files Browse the repository at this point in the history
  • Loading branch information
LRGH committed Mar 3, 2024
1 parent a2a357f commit 9487932
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 42 deletions.
53 changes: 34 additions & 19 deletions .github/workflows/portability.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,27 +31,42 @@ jobs:
unxz cpu_rec_corpus/*.xz
- name: Analyze /bin/sh
run: |
python ./cpu_rec.py /bin/sh
python ./cpu_rec.py -v /bin/sh
- name: Same, but should be faster because using pickled data
run: |
python ./cpu_rec.py /bin/sh
rm stats.pick
- name: Analyze /bin/sh with LRGH's elfesteem
python ./cpu_rec.py -v /bin/sh
- name: Install three variants of elfesteem
run: |
git clone https://github.com/LRGH/elfesteem
export PYTHONPATH=./elfesteem
python ./cpu_rec.py /bin/sh
rm -rf elfesteem
rm stats.pick
mv elfesteem elfesteem.LRGH
git clone https://github.com/airbus-seclab/elfesteem
mv elfesteem elfesteem.seclab
git clone https://github.com/serpilliere/elfesteem
mv elfesteem elfesteem.serpi
- name: Analyze /bin/sh with LRGH's elfesteem
run: |
export PYTHONPATH=./elfesteem.LRGH
python ./cpu_rec.py -v /bin/sh
- name: Analyze /bin/sh with airbus-seclab's elfesteem
if: ${{ steps.pyver.outputs.pyver == 2 && matrix.os != 'macos-latest' }}
# To check why it does not work with MacOSX
run: |
git clone https://github.com/airbus-seclab/elfesteem
export PYTHONPATH=./elfesteem
python ./cpu_rec.py /bin/sh
rm -rf elfesteem
rm stats.pick
export PYTHONPATH=./elfesteem.seclab
python ./cpu_rec.py -v /bin/sh
- name: Analyze /bin/sh with serpilliere's elfesteem
if: ${{ steps.pyver.outputs.pyver == 2 }}
run: |
export PYTHONPATH=./elfesteem.serpi
python ./cpu_rec.py -v /bin/sh
- name: Install lief
if: ${{ steps.pyver.outputs.pyver == 3 && !(matrix.os == 'macos-latest' && matrix.python-version == 'graalpy-22.3') }}
# To check why lief/graalpy does not work with MacOSX
run: |
pip install lief
- name: Analyze /bin/sh with lief
if: ${{ steps.pyver.outputs.pyver == 3 && !(matrix.os == 'macos-latest' && matrix.python-version == 'graalpy-22.3') }}
run: |
python ./cpu_rec.py -v /bin/sh
python24:
runs-on: "ubuntu-latest"
Expand All @@ -72,17 +87,17 @@ jobs:
python -c 'import sys;print(sys.version)'
- name: Analyze /bin/sh
run: |
python ./cpu_rec.py /bin/sh
python ./cpu_rec.py -v /bin/sh
- name: Same, but should be faster because using pickled data
run: |
python ./cpu_rec.py /bin/sh
rm stats.pick
- name: Analyze /bin/sh with LRGH's elfesteem
python ./cpu_rec.py -v /bin/sh
- name: Install elfesteem
run: |
git clone https://github.com/LRGH/elfesteem
- name: Analyze /bin/sh with LRGH's elfesteem
run: |
export PYTHONPATH=./elfesteem
python ./cpu_rec.py /bin/sh
rm -rf elfesteem
python ./cpu_rec.py -v /bin/sh
test-C:
runs-on: ${{ matrix.os }}
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ files, and outputs the architecture corresponding to this section;
the possibility of extracting the text section is also used when
building a corpus from full binary files.

If lief (https://lief-project.github.io/) is installed, then the tool can
use it instead of elfesteem. If both are installed, elfesteem is used by
default and option `-l` gives priority to lief.

Option `-d` followed by a directory dumps the corpus in that directory;
using this option one can reconstruct the default corpus.

Expand Down
83 changes: 60 additions & 23 deletions cpu_rec.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,15 @@
# If the result is not satisfying, prepending twice -v to the arguments
# makes the tool very verbose; this is helpful when adding a new
# architecture to the corpus.
# If https://github.com/airbus-seclab/elfesteem is installed, then the
# If https://github.com/LRGH/elfesteem is installed, then the
# tool also extracts the text section from ELF, PE, Mach-O or COFF
# files, and outputs the architecture corresponding to this section;
# the possibility of extracting the text section is also used when
# building a corpus from full binary files.
# NB: versions of elfesteem other than LRGH's need python 2.
# If https://lief-project.github.io/ is installed, then the tool can
# also extract ELF, PE or Mach-O text sections.
# NB: lief needs python 3.
# Option -d followed by a directory dumps the corpus in that directory;
# using this option one can reconstruct the default corpus.

Expand Down Expand Up @@ -92,7 +96,7 @@ def __init__(self):
self.files = []
self.data = []
def dump(self, dumpdir=None):
""" Dump the raw corpus, in a form that won't need elftesteem to be loaded """
""" Dump the raw corpus, in a form that won't need elfesteem to be loaded """
for arch, data in zip(self.archs, self.data):
of = open(dumpdir+'/'+arch.replace('/','-')+'.corpus', 'ab')
of.write(data)
Expand All @@ -117,7 +121,8 @@ def add_training(self, arch, file=None, section='text', data=None, repeat=1):
elif isinstance(section, slice):
data = data[section]
elif isinstance(section, str):
data = TrainingData.extract_section(data, section=section)
d_txt = TrainingData.extract_section_elfesteem(data, section=section)
if d_txt != None: data = d_txt
else:
raise TypeError("Invalid type %s for section in add_training"%section.__class__.__name__)
else:
Expand Down Expand Up @@ -220,13 +225,15 @@ def unpack_file(data):
data = TrainingData.unpack_chex(data)
return data
@staticmethod
def extract_section(data, section=False):
def extract_section_elfesteem(data, section=False):
# Extract text sections from known containers
# elfesteem has to be installed
try:
import elfesteem
except ImportError:
return data
log.info("Could not load elfesteem")
return None
log.info("Extracting %s section with elfesteem", section)
magic = ( 0x7f,0x45,0x4c,0x46 )
if data.startswith(struct.pack("%dB"%len(magic),*magic)):
from elfesteem import elf_init
Expand Down Expand Up @@ -262,6 +269,25 @@ def extract_section(data, section=False):
except ValueError:
pass
return data
@staticmethod
def extract_section_lief(data, section=None):
    """ Extract the text section(s) from a known container, using lief.

    data:    raw content of the file to analyze.
    section: exact name of the section to extract; None (or any false
             value) and the generic keyword 'text' both select the usual
             text section names.

    Returns None when lief cannot be imported, `data` unchanged when lief
    does not recognize the container, and otherwise the concatenation of
    the content of every matching section (possibly empty).
    """
    # lief has to be installed
    try:
        import lief
    except ImportError:
        log.info("Could not load lief")
        return None
    log.info("Extracting %s section with lief", section)
    parsed = lief.parse(data)
    if not parsed:
        return data
    if not section or section == 'text':
        # The main program asks for the generic 'text' section, which is
        # not a literal section name in any container: map it to the
        # names used by ELF/PE ('.text') and Mach-O ('__text').
        # '__TEXT' is kept for backward compatibility.
        section_names = (".text", "__text", "__TEXT")
    else:
        section_names = (section,)
    # Depending on the version of lief, the content of a section is a
    # memoryview or a list of integers; bytes() accepts both.
    return b"".join(bytes(s.content)
                    for s in parsed.sections
                    if s.name in section_names)
def read_corpus(self):
""" Gets the raw training dataset """
basedir = os.path.dirname(os.path.realpath(__file__))
Expand Down Expand Up @@ -739,24 +765,30 @@ def which_arch(d = None, training = {}):


if __name__ == "__main__":
fast, dump = False, False
fast, dump, use_lief = False, False, False
argv = sys.argv[1:]
if len(argv) and argv[0] == '-d':
dump = True
assert len(argv) == 2
dumpdir = argv[1]
if not os.path.isdir(dumpdir):
log.error("Directory %r should be created before running the tool", dumpdir)
sys.exit(1)
if len(argv) and argv[0] == '-f':
fast = True
argv = argv[1:]
if len(argv) and argv[0] == '-v':
log.setLevel(logging.INFO)
argv = argv[1:]
if len(argv) and argv[0] == '-v':
log.setLevel(logging.DEBUG)
while len(argv):
if argv[0] == '-d':
dump = True
assert len(argv) == 2
dumpdir = argv[1]
if not os.path.isdir(dumpdir):
log.error("Directory %r should be created before running the tool", dumpdir)
sys.exit(1)
elif argv[0] == '-f':
fast = True
argv = argv[1:]
elif argv[0] == '-l':
use_lief = True
argv = argv[1:]
elif argv[0] == '-v':
if log.getEffectiveLevel() == 30:
log.setLevel(logging.INFO)
else:
log.setLevel(logging.DEBUG)
argv = argv[1:]
else:
break
if dump:
# Always recompute data for dump
t = TrainingData()
Expand Down Expand Up @@ -803,8 +835,13 @@ def which_arch(d = None, training = {}):
log.debug(" %s", r2[:4])
log.debug(" %s", r3[:4])
# Text section, if possible
d_txt = TrainingData.extract_section(d, section='text')
if len(d) != len(d_txt):
if use_lief:
d_txt = TrainingData.extract_section_lief(d, section='text')
else:
d_txt = TrainingData.extract_section_elfesteem(d, section='text')
if d_txt == None:
d_txt = TrainingData.extract_section_lief(d, section='text')
if d_txt != None:
res, r2, r3 = p.deduce(d_txt)
sys.stdout.write('%-15s%-10s' % ('text(%#x)' % len(d_txt), res))
else:
Expand Down

0 comments on commit 9487932

Please sign in to comment.