From 8fd082dfe6cf97c2b96b647f15a2aafc8549306d Mon Sep 17 00:00:00 2001 From: Ishan Jain Date: Mon, 20 Jan 2025 07:23:22 +0530 Subject: [PATCH] switched hashmap to fxhash --- Cargo.lock | 16 ++ geofw/Cargo.toml | 1 + geofw/src/main.rs | 34 +++- geofw/src/maxmind.rs | 384 +++++++++++++++++++++++++------------------ run.sh | 2 +- 5 files changed, 277 insertions(+), 160 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ca4e86f..7ec8e41 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -199,6 +199,12 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.9.0" @@ -346,6 +352,15 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "geofw" version = "0.1.0" @@ -356,6 +371,7 @@ dependencies = [ "aya-log", "clap", "env_logger", + "fxhash", "geofw-common", "geofw-ebpf", "libc", diff --git a/geofw/Cargo.toml b/geofw/Cargo.toml index 729cb03..12a7e6a 100644 --- a/geofw/Cargo.toml +++ b/geofw/Cargo.toml @@ -17,6 +17,7 @@ tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "net" clap = { workspace = true, features = ["derive"] } mio = "1.0.3" maxminddb = "0.24.0" +fxhash = "0.2.1" [build-dependencies] anyhow = { workspace = true } aya-build = { workspace = true } diff --git a/geofw/src/main.rs b/geofw/src/main.rs index 3a2d795..1ce263c 100644 --- a/geofw/src/main.rs +++ b/geofw/src/main.rs @@ -21,7 +21,39 @@ struct Opt { async fn main() -> anyhow::Result<()> { let maxmind_db = maxmind::MaxmindDB::new("./geofw/GeoLite2-City.mmdb"); println!("{:?}", maxmind_db); - maxmind_db.read_binary_search_tree(); + println!( + "{:?}", + maxmind_db.lookup(IpAddr::V6(Ipv6Addr::new( + // 0x2c0f, 0xfe30, 0x4000, 0, 0, 0, 0, 0, + 0x2a0a, 0x6040, 0x4004, 0x10, 0, 0, 0, 0, + ))) + ); + println!( + "{} {}", + maxmind_db.metadata.data_section_start, + maxmind_db.data.len() + ); + + // maxmind_db.read_binary_tree(0, 0); + + let maxmind_db = maxmind::MaxmindDB::new("./geofw/GeoLite2-ASN.mmdb"); + println!("{:?}", maxmind_db); + println!( + "{:?}", + maxmind_db.lookup(IpAddr::V6(Ipv6Addr::new( + // 0x2c0f, 0xfe30, 0x4000, 0, 0, 0, 0, 0, + 0x2a0a, 0x6040, 0x4004, 0x10, 0, 0, 0, 0, + ))) + ); + + println!( + "{} {}", + maxmind_db.metadata.data_section_start, + maxmind_db.data.len() + ); + + maxmind_db.read_binary_tree(0, 0); + return Ok(()); let opt = Opt::parse(); diff --git a/geofw/src/maxmind.rs b/geofw/src/maxmind.rs index e5ca2b7..736aa85 100644 --- a/geofw/src/maxmind.rs +++ b/geofw/src/maxmind.rs @@ -1,10 +1,10 @@ +use fxhash::FxHashMap; use std::{ - cmp::Ordering, - collections::HashMap, fmt::{Debug, Formatter, Result as FmtResult}, fs::File, io::Read, - net::Ipv4Addr, + net::IpAddr, + ops::Range, }; const METADATA_SECTION_START: &[u8] = &[ @@ -12,27 +12,25 @@ const METADATA_SECTION_START: &[u8] = &[ ]; pub struct MaxmindDB { - metadata: Metadata, - data: Vec, + pub metadata: Metadata, + pub data: Vec, } #[derive(Debug, Default)] -struct Metadata { +pub struct Metadata { node_count: u32, record_size: u16, - binary_tree_section_start: usize, - data_section_start: usize, - metadata_section_start: usize, + pub data_section_start: usize, } #[derive(Debug, PartialEq, Clone)] -enum Data { - String(String), +pub enum Data { + String(Range), Double(f64), Bytes(Vec), U16(u16), U32(u32), - Map(HashMap), + Map(FxHashMap), I32(i32), U64(u64), U128(u128), @@ -70,17 +68,15 @@ impl MaxmindDB { }; let m = db.read_metadata(metadata_start); - println!("metadata = {:?}", m); let Data::U16(record_size) = *m.get("record_size").unwrap() else { unreachable!() }; let Data::U32(node_count) = *m.get("node_count").unwrap() else { unreachable!() }; + db.metadata = Metadata { - binary_tree_section_start: 0, data_section_start: ((record_size as usize * 2) / 8) * node_count as usize + 16, - metadata_section_start: metadata_start, record_size, node_count, }; @@ -88,196 +84,268 @@ impl MaxmindDB { db } - fn read_metadata(&self, metadata_start: usize) -> HashMap { + fn read_metadata(&self, metadata_start: usize) -> FxHashMap { let (Data::Map(map), _) = self.read_data(metadata_start) else { unreachable!() }; map } - pub fn read_binary_search_tree(&self) { - // Only support 28bit format for now - assert_eq!(self.metadata.record_size, 28); + fn node_from_bytes(n: &[u8], bit: u128, record_size: u16) -> u32 { + match record_size { + 28 => { + if bit == 0 { + u32::from_be_bytes([(n[3] & 0b1111_0000) >> 4, n[0], n[1], n[2]]) + } else { + u32::from_be_bytes([n[3] & 0b0000_1111, n[4], n[5], n[6]]) + } + } + 24 => { + if bit == 0 { + u32::from_be_bytes([0, n[0], n[1], n[2]]) + } else { + u32::from_be_bytes([0, n[3], n[4], n[5]]) + } + } + _ => unreachable!(), + } + } + pub fn lookup(&self, addr: IpAddr) -> Option { let node_size = self.metadata.record_size as usize * 2 / 8; - let mut node = 96; - let mut ip = Ipv4Addr::new(139, 84, 164, 110).to_bits(); + let mut node = 0; + let mut ip = match addr { + IpAddr::V4(a) => a.to_bits() as u128, + IpAddr::V6(a) => a.to_bits(), + }; let mut i = 0; - while i < 32 && node < self.metadata.node_count { - let bit = ip & 0x80000000; + while i < 128 && node < self.metadata.node_count { + let bit = ip & (1 << 127); ip <<= 1; let n = &self.data[node as usize * node_size..(node as usize * node_size) + node_size]; - node = if bit == 0 { - u32::from_be_bytes([n[3] & 0b1111_0000, n[0], n[1], n[2]]) - } else { - u32::from_be_bytes([n[3] & 0b0000_1111, n[4], n[5], n[6]]) - }; - + node = Self::node_from_bytes(n, bit, self.metadata.record_size); i += 1; } if node == self.metadata.node_count { - println!("not found!"); + None } else { let data_section_offset = node - self.metadata.node_count; - let data = self + let (data, _) = self .read_data(self.metadata.data_section_start + data_section_offset as usize - 16); - println!("{:?}", data); + Some(data) } } + pub fn read_binary_tree(&self, node: u32, position: usize) { + let mut stack = vec![]; + let node_size = self.metadata.record_size as usize * 2 / 8; + + let mut count = 0; + stack.push((node, position)); + + while let Some((node, position)) = stack.pop() { + let n = &self.data[node as usize * node_size..(node as usize * node_size) + node_size]; + let node_1 = Self::node_from_bytes(n, 0, self.metadata.record_size); + let node_2 = Self::node_from_bytes(n, 1, self.metadata.record_size); + + if position < 128 && node_1 < self.metadata.node_count { + stack.push((node_1, position + 1)); + } + if position < 128 && node_2 < self.metadata.node_count { + stack.push((node_2, position + 1)); + } + + if node_1 > self.metadata.node_count { + let data_section_offset = node_1 - self.metadata.node_count; + + let data = self.read_data( + self.metadata.data_section_start + data_section_offset as usize - 16, + ); + count += 1; + } + + if node_2 > self.metadata.node_count { + let data_section_offset = node_2 - self.metadata.node_count; + let data = self.read_data( + self.metadata.data_section_start + data_section_offset as usize - 16, + ); + + count += 1; + } + } + + println!("count = {}", count); + } + fn read_data(&self, read_offset: usize) -> (Data, usize) { - // println!("read offset: {}", read_offset); let data = &self.data[read_offset..]; let (data_type, length, read) = Self::read_data_meta(data); - // println!("{} {} {}", data_type, length, read); - match data_type { - 1 => { - // println!("read = {:?}", read + read_offset); - // let data = &data[read..]; - let s = (data[0] >> 3) & 0x3; - let v = data[0] & 0b0000_0111; - - let pointer = match s { - 0 => u32::from_be_bytes([0, 0, v, data[1]]), - 1 => u32::from_be_bytes([0, v, data[1], data[2]]) + 2048, - 2 => u32::from_be_bytes([v, data[1], data[2], data[3]]) + 526336, - 3 => u32::from_be_bytes([data[1], data[2], data[3], data[4]]), - _ => unreachable!(), - }; - - let (data, _) = self.read_data(self.metadata.data_section_start + pointer as usize); - (data, s as usize + 1 + 1) - } + 1 => self.follow_pointer(read_offset), 2 => { - let value = &data[read..read + length]; - + // let value = String::from_utf8_lossy(&data[read..read + length]); ( - Data::String(String::from_utf8_lossy(value).to_string()), + Data::String(read_offset + read..read_offset + read + length), read + length, ) } 3 => { assert_eq!(length, 8); - let s = &data[read..read + length]; - let num = f64::from_be_bytes([s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]]); - (Data::Double(num), read + length) + (Self::read_float::<8>(data), read + length) } 4 => { todo!("reached data field???"); } - 5 => { - let slice = &data[read..read + length]; - let number = match *slice { - [] => 0, - [a] => a as u16, - [a, b] => (a as u16) << 8 | b as u16, - _ => unreachable!(), - }; - - (Data::U16(number), read + length) - } - 6 => { - let slice = &data[read..read + length]; - let number = match *slice { - [] => 0, - [a] => a as u32, - [a, b] => (a as u32) << 8 | b as u32, - [a, b, c] => (a as u32) << 16 | (b as u32) << 8 | c as u32, - [a, b, c, d] => { - (a as u32) << 24 | (b as u32) << 16 | (c as u32) << 8 | d as u32 - } - _ => unreachable!(), - }; - - (Data::U32(number), read + length) - } - 7 => { - let mut map = HashMap::with_capacity(length); - // length is number of elements - let mut length = length; - let mut read = read; - - while length > 0 { - let (key, r) = self.read_data(read_offset + read); - read += r; - let (value, r) = self.read_data(read_offset + read); - read += r; - - let Data::String(key) = key else { - unreachable!() - }; - - map.insert(key, value); - length -= 1; - } - - (Data::Map(map), read) - } - 8 => { - let slice = &data[read..read + length]; - let number = match *slice { - [] => 0, - [a] => a as i32, - [a, b] => (a as i32) << 8 | b as i32, - [a, b, c] => (a as i32) << 16 | (b as i32) << 8 | c as i32, - [a, b, c, d] => { - (a as i32) << 24 | (b as i32) << 16 | (c as i32) << 8 | d as i32 - } - _ => unreachable!(), - }; - - (Data::I32(number), read + length) - } - 9 => { - let slice = &data[read..read + length]; - let number = slice.iter().enumerate().fold(0, |acc, (i, &byte)| { - acc | ((byte as u64) << (8 * (slice.len() - i - 1))) - }); - - (Data::U64(number), read + length) - } - 10 => { - let slice = &data[read..read + length]; - let number = slice.iter().enumerate().fold(0, |acc, (i, &byte)| { - acc | ((byte as u128) << (8 * (slice.len() - i - 1))) - }); - - (Data::U128(number), read + length) - } - 11 => { - let mut read = read; - let mut out = vec![]; - let mut length = length; - - while length > 0 { - let (value, r) = self.read_data(read_offset + read); - read += r; - length -= 1; - out.push(value); - } - - (Data::Array(out), read) - } + 5 => (self.read_u16(read_offset + read, length), read + length), + 6 => (self.read_u32(read_offset + read, length), read + length), + 7 => self.read_map(read_offset, read, length), + 8 => (self.read_i32(read_offset + read, length), read + length), + 9 => (self.read_u64(read_offset + read, length), read + length), + 10 => (self.read_u128(read_offset + read, length), read + length), + 11 => self.read_array(read_offset, read, length), 12 => { todo!("reached data cache container"); } - 13 => (Data::End, read_offset), - 14 => { - todo!("reached boolean"); - } + 13 => (Data::End, read_offset + read), + 14 => (Data::Boolean(length == 1), read), 15 => { assert_eq!(length, 4); - let s = &data[read..read + length]; - let num = f32::from_be_bytes([s[0], s[1], s[2], s[3]]); - (Data::Float(num), read + length) + (Self::read_float::<4>(data), read + length) + } + _ => unreachable!(), + } + } + + fn read_map(&self, offset: usize, read: usize, length: usize) -> (Data, usize) { + let mut map = FxHashMap::with_capacity_and_hasher(length, Default::default()); + // length is number of elements + let mut length = length; + let mut read = read; + + while length > 0 { + let (key, r) = self.read_data(offset + read); + read += r; + let (value, r) = self.read_data(offset + read); + read += r; + + let Data::String(key) = key else { + unreachable!() + }; + let key = String::from_utf8_lossy(&self.data[key]); + + map.insert(key.to_string(), value); + length -= 1; + } + + (Data::Map(map), read) + } + + fn read_array(&self, offset: usize, read: usize, length: usize) -> (Data, usize) { + let mut read = read; + let mut out = vec![]; + let mut length = length; + + while length > 0 { + let (value, r) = self.read_data(offset + read); + read += r; + length -= 1; + out.push(value); + } + + (Data::Array(out), read) + } + + fn read_u16(&self, offset: usize, length: usize) -> Data { + let slice = &self.data[offset..offset + length]; + let number = match *slice { + [] => 0, + [a] => a as u16, + [a, b] => (a as u16) << 8 | b as u16, + _ => unreachable!(), + }; + + Data::U16(number) + } + + fn read_i32(&self, offset: usize, length: usize) -> Data { + let slice = &self.data[offset..offset + length]; + let number = match *slice { + [] => 0, + [a] => a as i32, + [a, b] => (a as i32) << 8 | b as i32, + [a, b, c] => (a as i32) << 16 | (b as i32) << 8 | c as i32, + [a, b, c, d] => (a as i32) << 24 | (b as i32) << 16 | (c as i32) << 8 | d as i32, + _ => unreachable!(), + }; + + Data::I32(number) + } + + fn read_u32(&self, offset: usize, length: usize) -> Data { + let slice = &self.data[offset..offset + length]; + let number = match *slice { + [] => 0, + [a] => a as u32, + [a, b] => (a as u32) << 8 | b as u32, + [a, b, c] => (a as u32) << 16 | (b as u32) << 8 | c as u32, + [a, b, c, d] => (a as u32) << 24 | (b as u32) << 16 | (c as u32) << 8 | d as u32, + _ => unreachable!(), + }; + + Data::U32(number) + } + + fn read_u64(&self, offset: usize, length: usize) -> Data { + let slice = &self.data[offset..offset + length]; + let number = slice.iter().enumerate().fold(0, |acc, (i, &byte)| { + acc | ((byte as u64) << (8 * (slice.len() - i - 1))) + }); + + Data::U64(number) + } + + fn read_u128(&self, offset: usize, length: usize) -> Data { + let slice = &self.data[offset..offset + length]; + let number = slice.iter().enumerate().fold(0, |acc, (i, &byte)| { + acc | ((byte as u128) << (8 * (slice.len() - i - 1))) + }); + + Data::U128(number) + } + + fn follow_pointer(&self, offset: usize) -> (Data, usize) { + let data = &self.data[offset..]; + let s = (data[0] >> 3) & 0x3; + let v = data[0] & 0b0000_0111; + + let pointer = match s { + 0 => u32::from_be_bytes([0, 0, v, data[1]]), + 1 => u32::from_be_bytes([0, v, data[1], data[2]]) + 2048, + 2 => u32::from_be_bytes([v, data[1], data[2], data[3]]) + 526336, + 3 => u32::from_be_bytes([data[1], data[2], data[3], data[4]]), + _ => unreachable!(), + }; + + let (data, _) = self.read_data(self.metadata.data_section_start + pointer as usize); + (data, s as usize + 1 + 1) + } + + fn read_float(d: &[u8]) -> Data { + match T { + 4 => { + let num = f32::from_be_bytes([d[0], d[1], d[2], d[3]]); + Data::Float(num) + } + 8 => { + let num = f64::from_be_bytes([d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]]); + Data::Double(num) } _ => unreachable!(), } diff --git a/run.sh b/run.sh index 6077259..bbbe2f1 100755 --- a/run.sh +++ b/run.sh @@ -3,6 +3,6 @@ set -e set -o pipefail -RUST_LOG=info cargo run +RUST_LOG=info cargo run --release # RUST_LOG=info cargo run --config 'target."cfg(all())".runner="sudo -E"'