tsffs/fuzzer/tokenize/mod.rs

use anyhow::Result;
use goblin::{pe::Coff, Object};
use libafl::prelude::{NaiveTokenizer, Tokenizer};
use std::{fs::read, path::Path};

/// Minimum length, in bytes, of a UTF-8 string worth keeping as a token.
const STRING_TOKEN_MIN_LEN: usize = 3;
/// Minimum length, in bytes, of a wide (UTF-16) string worth keeping as a token.
const WCHAR_STRING_TOKEN_MIN_LEN: usize = 4;
/// Tokenize a set of source files into string tokens using LibAFL's
/// `NaiveTokenizer`.
pub fn tokenize_src_file<I, P>(source_files: I) -> Result<Vec<String>>
where
    I: IntoIterator<Item = P>,
    P: AsRef<Path>,
{
    let mut tokens = Vec::new();
    let tokenizer = NaiveTokenizer::default();

    source_files.into_iter().try_for_each(|f| {
        tokenizer
            .tokenize(&read(f.as_ref())?)
            .map(|t| tokens.extend(t))
    })?;

    Ok(tokens)
}
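// Usage sketch (illustrative, not part of the original module; the paths are
// placeholders):
//
//     let tokens: Vec<String> = tokenize_src_file(["src/parse.c", "src/proto.h"])?;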
/// Extract printable UTF-8 and wide (UTF-16) string tokens from a raw byte
/// buffer, such as the contents of a data section.
fn tokenize_strings(bytes: &[u8]) -> Result<Vec<Vec<u8>>> {
    const WCHAR_SIZE: usize = 2;
    let mut tokens = Vec::new();
    let mut remap_bytes = Vec::new();

    // Collapse each run of NUL bytes into a single NUL so zero-padded gaps
    // between strings do not produce a flood of empty fragments below.
    bytes.iter().for_each(|b| {
        if !(remap_bytes.last().is_some_and(|l| *l == 0) && *b == 0) {
            remap_bytes.push(*b);
        }
    });

    // Split on NUL terminators and keep each fragment that decodes as either
    // a wide string (even length of at least WCHAR_STRING_TOKEN_MIN_LEN
    // bytes, valid UTF-16 when byte pairs are read big-endian) or a UTF-8
    // string of at least STRING_TOKEN_MIN_LEN bytes.
    remap_bytes
        .split(|b| *b == 0)
        .filter(|b| {
            (b.len() % WCHAR_SIZE == 0
                && b.len() >= WCHAR_STRING_TOKEN_MIN_LEN
                && String::from_utf16(
                    &b.chunks_exact(WCHAR_SIZE)
                        .map(|c| (c[0] as u16) << 8 | c[1] as u16)
                        .collect::<Vec<_>>(),
                )
                .is_ok())
                || (b.len() >= STRING_TOKEN_MIN_LEN && String::from_utf8(b.to_vec()).is_ok())
        })
        .for_each(|b| tokens.push(b.to_vec()));

    Ok(tokens)
}
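// Worked example (illustrative): for input b"ERROR\0\0\0ok\0HTTP/1.1\0", the
// NUL-collapsing pass yields b"ERROR\0ok\0HTTP/1.1\0"; splitting on NUL gives
// ["ERROR", "ok", "HTTP/1.1", ""], and the length/validity filter keeps
// ["ERROR", "HTTP/1.1"] ("ok" falls below STRING_TOKEN_MIN_LEN).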
/// Extract string tokens from the data sections of an executable. ELF and PE
/// images and raw COFF objects are supported; other formats yield no tokens.
pub fn tokenize_executable_file<P>(executable: P) -> Result<Vec<Vec<u8>>>
where
    P: AsRef<Path>,
{
    let mut tokens = Vec::new();
    let contents = read(executable.as_ref())?;

    match Object::parse(&contents)? {
        Object::Elf(e) => {
            // Scan every non-executable, non-allocated section (debug info,
            // comments, string tables, and the like) that has file-backed
            // contents.
            e.section_headers
                .iter()
                .filter(|sh| !sh.is_executable() && !sh.is_alloc())
                .filter_map(|sh| sh.file_range())
                .try_for_each(|range| {
                    tokenize_strings(&contents[range]).map(|t| tokens.extend(t))
                })?;
        }
        Object::PE(p) => {
            // Scan the initialized data sections.
            p.sections
                .iter()
                .filter(|s| s.name().is_ok_and(|n| n == ".rdata" || n == ".data"))
                .try_for_each(|s| {
                    tokenize_strings(
                        &contents[s.pointer_to_raw_data as usize
                            ..s.pointer_to_raw_data as usize + s.size_of_raw_data as usize],
                    )
                    .map(|t| tokens.extend(t))
                })?;
        }
        _ => {}
    }

    // Additionally attempt a direct COFF parse so raw object files, which are
    // not matched by the ELF or PE cases above, are also scanned.
    if let Ok(coff) = Coff::parse(&contents) {
        coff.sections
            .iter()
            .filter(|s| s.name().is_ok_and(|n| n == ".rdata" || n == ".data"))
            .try_for_each(|s| {
                tokenize_strings(
                    &contents[s.pointer_to_raw_data as usize
                        ..s.pointer_to_raw_data as usize + s.size_of_raw_data as usize],
                )
                .map(|t| tokens.extend(t))
            })?;
    }

    Ok(tokens)
}
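// Usage sketch (illustrative; whether and how callers combine the two token
// sources is an assumption, and "target.efi" / "src/main.c" are placeholder
// paths): collect binary string tokens, then append source-derived tokens as
// raw bytes.
//
//     let mut tokens = tokenize_executable_file("target.efi")?;
//     tokens.extend(
//         tokenize_src_file(["src/main.c"])?
//             .into_iter()
//             .map(String::into_bytes),
//     );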