tsffs/fuzzer/tokenize/
mod.rs

1// Copyright (C) 2024 Intel Corporation
2// SPDX-License-Identifier: Apache-2.0
3
4//! Tokenization of executables
5
6use anyhow::Result;
7use goblin::{pe::Coff, Object};
8use libafl::prelude::{NaiveTokenizer, Tokenizer};
9use std::{fs::read, path::Path};
10
11// 3 character string minimum
12const STRING_TOKEN_MIN_LEN: usize = 3;
// Counted in bytes. PE uses 16-bit characters, so we need 4 bytes. We set this to 4 because
// PE strings can just be UTF-8 encoded as UTF-16, so we don't want to double it.
15const WCHAR_STRING_TOKEN_MIN_LEN: usize = 4;
16
17pub fn tokenize_src_file<I, P>(source_files: I) -> Result<Vec<String>>
18where
19    I: IntoIterator<Item = P>,
20    P: AsRef<Path>,
21{
22    let mut tokens = Vec::new();
23    let tokenizer = NaiveTokenizer::default();
24
25    source_files.into_iter().try_for_each(|f| {
26        tokenizer
27            .tokenize(&read(f.as_ref())?)
28            .map(|t| tokens.extend(t))
29    })?;
30
31    Ok(tokens)
32}
33
34fn tokenize_strings(bytes: &[u8]) -> Result<Vec<Vec<u8>>> {
35    const WCHAR_SIZE: usize = 2;
36    let mut tokens = Vec::new();
37    let mut remap_bytes = Vec::new();
38
39    // Smush sequences of 0 to single 0s
40    bytes.iter().for_each(|b| {
41        if remap_bytes.last().is_some_and(|l| *l == 0) {
42            // Nothing
43        } else {
44            remap_bytes.push(*b);
45        }
46    });
47
48    remap_bytes
49        .split(|b| *b == 0)
50        .filter(|b| {
51            // If we can interpret a nul-terminated slice as a utf-16 or utf-8 string, we take it
52            // as a token
53            (b.len() % 2 == 0
54                && b.len() >= WCHAR_STRING_TOKEN_MIN_LEN
55                && String::from_utf16(
56                    &b.chunks_exact(WCHAR_SIZE)
57                        // Big endian re-encode as &[u16]
58                        .map(|c| (c[0] as u16) << 8 | c[1] as u16)
59                        .collect::<Vec<_>>(),
60                )
61                .is_ok())
62                || (b.len() >= STRING_TOKEN_MIN_LEN && String::from_utf8(b.to_vec()).is_ok())
63        })
64        .for_each(|b| tokens.push(b.to_vec()));
65
66    Ok(tokens)
67}
68
69/// Naively tokenize an executable file by parsing its data sections. This very much assumes the
70/// executable isn't behaving badly and that strings in it are actually in the data section.
71///
72/// For ELF executables, we take all non-executable and non-alloc sections.
73///
74/// For PE and COFF executables, we take the reserved sections .data and .rdata as noted in the
75/// [docs](https://learn.microsoft.com/en-us/windows/win32/debug/pe-format#special-sections).
76pub fn tokenize_executable_file<P>(executable: P) -> Result<Vec<Vec<u8>>>
77where
78    P: AsRef<Path>,
79{
80    let mut tokens = Vec::new();
81    let contents = read(executable.as_ref())?;
82
83    match Object::parse(&contents)? {
84        Object::Elf(e) => {
85            e.section_headers
86                .iter()
87                .filter(|sh| !sh.is_executable() && !sh.is_alloc())
88                .filter_map(|sh| sh.file_range())
89                .try_for_each(|range| {
90                    tokenize_strings(&contents[range]).map(|t| tokens.extend(t))
91                })?;
92        }
93        Object::PE(p) => {
94            p.sections
95                .iter()
96                .filter(|s| s.name().is_ok_and(|n| n == ".rdata" || n == ".data"))
97                .try_for_each(|s| {
98                    tokenize_strings(
99                        &contents[s.pointer_to_raw_data as usize
100                            ..s.pointer_to_raw_data as usize + s.size_of_raw_data as usize],
101                    )
102                    .map(|t| tokens.extend(t))
103                })?;
104        }
105        _ => {}
106    }
107
108    if let Ok(coff) = Coff::parse(&contents) {
109        coff.sections
110            .iter()
111            .filter(|s| s.name().is_ok_and(|n| n == ".rdata" || n == ".data"))
112            .try_for_each(|s| {
113                tokenize_strings(
114                    &contents[s.pointer_to_raw_data as usize
115                        ..s.pointer_to_raw_data as usize + s.size_of_raw_data as usize],
116                )
117                .map(|t| tokens.extend(t))
118            })?;
119    }
120
121    Ok(tokens)
122}