switched hashmap to fxhash

This commit is contained in:
Ishan Jain 2025-01-20 07:23:22 +05:30
parent 7883b25a8b
commit 8fd082dfe6
5 changed files with 277 additions and 160 deletions

16
Cargo.lock generated
View File

@ -199,6 +199,12 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]] [[package]]
name = "bytes" name = "bytes"
version = "1.9.0" version = "1.9.0"
@ -346,6 +352,15 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]] [[package]]
name = "geofw" name = "geofw"
version = "0.1.0" version = "0.1.0"
@ -356,6 +371,7 @@ dependencies = [
"aya-log", "aya-log",
"clap", "clap",
"env_logger", "env_logger",
"fxhash",
"geofw-common", "geofw-common",
"geofw-ebpf", "geofw-ebpf",
"libc", "libc",

View File

@ -17,6 +17,7 @@ tokio = { workspace = true, features = ["macros", "rt", "rt-multi-thread", "net"
clap = { workspace = true, features = ["derive"] } clap = { workspace = true, features = ["derive"] }
mio = "1.0.3" mio = "1.0.3"
maxminddb = "0.24.0" maxminddb = "0.24.0"
fxhash = "0.2.1"
[build-dependencies] [build-dependencies]
anyhow = { workspace = true } anyhow = { workspace = true }
aya-build = { workspace = true } aya-build = { workspace = true }

View File

@ -21,7 +21,39 @@ struct Opt {
async fn main() -> anyhow::Result<()> { async fn main() -> anyhow::Result<()> {
let maxmind_db = maxmind::MaxmindDB::new("./geofw/GeoLite2-City.mmdb"); let maxmind_db = maxmind::MaxmindDB::new("./geofw/GeoLite2-City.mmdb");
println!("{:?}", maxmind_db); println!("{:?}", maxmind_db);
maxmind_db.read_binary_search_tree(); println!(
"{:?}",
maxmind_db.lookup(IpAddr::V6(Ipv6Addr::new(
// 0x2c0f, 0xfe30, 0x4000, 0, 0, 0, 0, 0,
0x2a0a, 0x6040, 0x4004, 0x10, 0, 0, 0, 0,
)))
);
println!(
"{} {}",
maxmind_db.metadata.data_section_start,
maxmind_db.data.len()
);
// maxmind_db.read_binary_tree(0, 0);
let maxmind_db = maxmind::MaxmindDB::new("./geofw/GeoLite2-ASN.mmdb");
println!("{:?}", maxmind_db);
println!(
"{:?}",
maxmind_db.lookup(IpAddr::V6(Ipv6Addr::new(
// 0x2c0f, 0xfe30, 0x4000, 0, 0, 0, 0, 0,
0x2a0a, 0x6040, 0x4004, 0x10, 0, 0, 0, 0,
)))
);
println!(
"{} {}",
maxmind_db.metadata.data_section_start,
maxmind_db.data.len()
);
maxmind_db.read_binary_tree(0, 0);
return Ok(()); return Ok(());
let opt = Opt::parse(); let opt = Opt::parse();

View File

@ -1,10 +1,10 @@
use fxhash::FxHashMap;
use std::{ use std::{
cmp::Ordering,
collections::HashMap,
fmt::{Debug, Formatter, Result as FmtResult}, fmt::{Debug, Formatter, Result as FmtResult},
fs::File, fs::File,
io::Read, io::Read,
net::Ipv4Addr, net::IpAddr,
ops::Range,
}; };
const METADATA_SECTION_START: &[u8] = &[ const METADATA_SECTION_START: &[u8] = &[
@ -12,27 +12,25 @@ const METADATA_SECTION_START: &[u8] = &[
]; ];
pub struct MaxmindDB { pub struct MaxmindDB {
metadata: Metadata, pub metadata: Metadata,
data: Vec<u8>, pub data: Vec<u8>,
} }
#[derive(Debug, Default)] #[derive(Debug, Default)]
struct Metadata { pub struct Metadata {
node_count: u32, node_count: u32,
record_size: u16, record_size: u16,
binary_tree_section_start: usize, pub data_section_start: usize,
data_section_start: usize,
metadata_section_start: usize,
} }
#[derive(Debug, PartialEq, Clone)] #[derive(Debug, PartialEq, Clone)]
enum Data { pub enum Data {
String(String), String(Range<usize>),
Double(f64), Double(f64),
Bytes(Vec<u8>), Bytes(Vec<u8>),
U16(u16), U16(u16),
U32(u32), U32(u32),
Map(HashMap<String, Data>), Map(FxHashMap<String, Data>),
I32(i32), I32(i32),
U64(u64), U64(u64),
U128(u128), U128(u128),
@ -70,17 +68,15 @@ impl MaxmindDB {
}; };
let m = db.read_metadata(metadata_start); let m = db.read_metadata(metadata_start);
println!("metadata = {:?}", m);
let Data::U16(record_size) = *m.get("record_size").unwrap() else { let Data::U16(record_size) = *m.get("record_size").unwrap() else {
unreachable!() unreachable!()
}; };
let Data::U32(node_count) = *m.get("node_count").unwrap() else { let Data::U32(node_count) = *m.get("node_count").unwrap() else {
unreachable!() unreachable!()
}; };
db.metadata = Metadata { db.metadata = Metadata {
binary_tree_section_start: 0,
data_section_start: ((record_size as usize * 2) / 8) * node_count as usize + 16, data_section_start: ((record_size as usize * 2) / 8) * node_count as usize + 16,
metadata_section_start: metadata_start,
record_size, record_size,
node_count, node_count,
}; };
@ -88,196 +84,268 @@ impl MaxmindDB {
db db
} }
fn read_metadata(&self, metadata_start: usize) -> HashMap<String, Data> { fn read_metadata(&self, metadata_start: usize) -> FxHashMap<String, Data> {
let (Data::Map(map), _) = self.read_data(metadata_start) else { let (Data::Map(map), _) = self.read_data(metadata_start) else {
unreachable!() unreachable!()
}; };
map map
} }
pub fn read_binary_search_tree(&self) { fn node_from_bytes(n: &[u8], bit: u128, record_size: u16) -> u32 {
// Only support 28bit format for now match record_size {
assert_eq!(self.metadata.record_size, 28); 28 => {
if bit == 0 {
u32::from_be_bytes([(n[3] & 0b1111_0000) >> 4, n[0], n[1], n[2]])
} else {
u32::from_be_bytes([n[3] & 0b0000_1111, n[4], n[5], n[6]])
}
}
24 => {
if bit == 0 {
u32::from_be_bytes([0, n[0], n[1], n[2]])
} else {
u32::from_be_bytes([0, n[3], n[4], n[5]])
}
}
_ => unreachable!(),
}
}
pub fn lookup(&self, addr: IpAddr) -> Option<Data> {
let node_size = self.metadata.record_size as usize * 2 / 8; let node_size = self.metadata.record_size as usize * 2 / 8;
let mut node = 96; let mut node = 0;
let mut ip = Ipv4Addr::new(139, 84, 164, 110).to_bits(); let mut ip = match addr {
IpAddr::V4(a) => a.to_bits() as u128,
IpAddr::V6(a) => a.to_bits(),
};
let mut i = 0; let mut i = 0;
while i < 32 && node < self.metadata.node_count { while i < 128 && node < self.metadata.node_count {
let bit = ip & 0x80000000; let bit = ip & (1 << 127);
ip <<= 1; ip <<= 1;
let n = &self.data[node as usize * node_size..(node as usize * node_size) + node_size]; let n = &self.data[node as usize * node_size..(node as usize * node_size) + node_size];
node = if bit == 0 { node = Self::node_from_bytes(n, bit, self.metadata.record_size);
u32::from_be_bytes([n[3] & 0b1111_0000, n[0], n[1], n[2]])
} else {
u32::from_be_bytes([n[3] & 0b0000_1111, n[4], n[5], n[6]])
};
i += 1; i += 1;
} }
if node == self.metadata.node_count { if node == self.metadata.node_count {
println!("not found!"); None
} else { } else {
let data_section_offset = node - self.metadata.node_count; let data_section_offset = node - self.metadata.node_count;
let data = self let (data, _) = self
.read_data(self.metadata.data_section_start + data_section_offset as usize - 16); .read_data(self.metadata.data_section_start + data_section_offset as usize - 16);
println!("{:?}", data); Some(data)
} }
} }
/// Walks the search tree depth-first starting at `node` and prints how many
/// data records were reached.
///
/// `position` is the depth assigned to the starting node. Children are only
/// queued while the current depth is below 128. Every record whose value is
/// greater than `node_count` is decoded with `read_data` (the decoded value is
/// discarded) and counted.
pub fn read_binary_tree(&self, node: u32, position: usize) {
    let node_count = self.metadata.node_count;
    let record_size = self.metadata.record_size;
    // Two records per node, `record_size` bits each.
    let node_size = record_size as usize * 2 / 8;

    let mut records_seen = 0;
    let mut pending = vec![(node, position)];

    while let Some((current, depth)) = pending.pop() {
        let start = current as usize * node_size;
        let raw = &self.data[start..start + node_size];

        // Decode the left (bit 0) and right (bit 1) records up front.
        let records = [
            Self::node_from_bytes(raw, 0, record_size),
            Self::node_from_bytes(raw, 1, record_size),
        ];

        for record in records {
            if depth < 128 && record < node_count {
                // Record points at another tree node: visit it later.
                pending.push((record, depth + 1));
            }

            if record > node_count {
                // Record points into the data section; decode it to exercise the reader.
                let data_section_offset = record - node_count;
                let _decoded = self.read_data(
                    self.metadata.data_section_start + data_section_offset as usize - 16,
                );
                records_seen += 1;
            }
        }
    }

    println!("count = {}", records_seen);
}
fn read_data(&self, read_offset: usize) -> (Data, usize) { fn read_data(&self, read_offset: usize) -> (Data, usize) {
// println!("read offset: {}", read_offset);
let data = &self.data[read_offset..]; let data = &self.data[read_offset..];
let (data_type, length, read) = Self::read_data_meta(data); let (data_type, length, read) = Self::read_data_meta(data);
// println!("{} {} {}", data_type, length, read);
match data_type { match data_type {
1 => { 1 => self.follow_pointer(read_offset),
// println!("read = {:?}", read + read_offset);
// let data = &data[read..];
let s = (data[0] >> 3) & 0x3;
let v = data[0] & 0b0000_0111;
let pointer = match s {
0 => u32::from_be_bytes([0, 0, v, data[1]]),
1 => u32::from_be_bytes([0, v, data[1], data[2]]) + 2048,
2 => u32::from_be_bytes([v, data[1], data[2], data[3]]) + 526336,
3 => u32::from_be_bytes([data[1], data[2], data[3], data[4]]),
_ => unreachable!(),
};
let (data, _) = self.read_data(self.metadata.data_section_start + pointer as usize);
(data, s as usize + 1 + 1)
}
2 => { 2 => {
let value = &data[read..read + length]; // let value = String::from_utf8_lossy(&data[read..read + length]);
( (
Data::String(String::from_utf8_lossy(value).to_string()), Data::String(read_offset + read..read_offset + read + length),
read + length, read + length,
) )
} }
3 => { 3 => {
assert_eq!(length, 8); assert_eq!(length, 8);
let s = &data[read..read + length];
let num = f64::from_be_bytes([s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]]);
(Data::Double(num), read + length) (Self::read_float::<8>(data), read + length)
} }
4 => { 4 => {
todo!("reached data field???"); todo!("reached data field???");
} }
5 => { 5 => (self.read_u16(read_offset + read, length), read + length),
let slice = &data[read..read + length]; 6 => (self.read_u32(read_offset + read, length), read + length),
let number = match *slice { 7 => self.read_map(read_offset, read, length),
[] => 0, 8 => (self.read_i32(read_offset + read, length), read + length),
[a] => a as u16, 9 => (self.read_u64(read_offset + read, length), read + length),
[a, b] => (a as u16) << 8 | b as u16, 10 => (self.read_u128(read_offset + read, length), read + length),
_ => unreachable!(), 11 => self.read_array(read_offset, read, length),
};
(Data::U16(number), read + length)
}
6 => {
let slice = &data[read..read + length];
let number = match *slice {
[] => 0,
[a] => a as u32,
[a, b] => (a as u32) << 8 | b as u32,
[a, b, c] => (a as u32) << 16 | (b as u32) << 8 | c as u32,
[a, b, c, d] => {
(a as u32) << 24 | (b as u32) << 16 | (c as u32) << 8 | d as u32
}
_ => unreachable!(),
};
(Data::U32(number), read + length)
}
7 => {
let mut map = HashMap::with_capacity(length);
// length is number of elements
let mut length = length;
let mut read = read;
while length > 0 {
let (key, r) = self.read_data(read_offset + read);
read += r;
let (value, r) = self.read_data(read_offset + read);
read += r;
let Data::String(key) = key else {
unreachable!()
};
map.insert(key, value);
length -= 1;
}
(Data::Map(map), read)
}
8 => {
let slice = &data[read..read + length];
let number = match *slice {
[] => 0,
[a] => a as i32,
[a, b] => (a as i32) << 8 | b as i32,
[a, b, c] => (a as i32) << 16 | (b as i32) << 8 | c as i32,
[a, b, c, d] => {
(a as i32) << 24 | (b as i32) << 16 | (c as i32) << 8 | d as i32
}
_ => unreachable!(),
};
(Data::I32(number), read + length)
}
9 => {
let slice = &data[read..read + length];
let number = slice.iter().enumerate().fold(0, |acc, (i, &byte)| {
acc | ((byte as u64) << (8 * (slice.len() - i - 1)))
});
(Data::U64(number), read + length)
}
10 => {
let slice = &data[read..read + length];
let number = slice.iter().enumerate().fold(0, |acc, (i, &byte)| {
acc | ((byte as u128) << (8 * (slice.len() - i - 1)))
});
(Data::U128(number), read + length)
}
11 => {
let mut read = read;
let mut out = vec![];
let mut length = length;
while length > 0 {
let (value, r) = self.read_data(read_offset + read);
read += r;
length -= 1;
out.push(value);
}
(Data::Array(out), read)
}
12 => { 12 => {
todo!("reached data cache container"); todo!("reached data cache container");
} }
13 => (Data::End, read_offset), 13 => (Data::End, read_offset + read),
14 => { 14 => (Data::Boolean(length == 1), read),
todo!("reached boolean");
}
15 => { 15 => {
assert_eq!(length, 4); assert_eq!(length, 4);
let s = &data[read..read + length];
let num = f32::from_be_bytes([s[0], s[1], s[2], s[3]]);
(Data::Float(num), read + length) (Self::read_float::<4>(data), read + length)
}
_ => unreachable!(),
}
}
/// Decodes a map of `length` key/value pairs.
///
/// `offset` is the position of the map's own control byte in `self.data` and
/// `read` is the number of bytes already consumed from there. Each key and
/// value is decoded with `read_data`; keys must decode to `Data::String`
/// (anything else hits `unreachable!()`), and are materialised as owned
/// strings with lossy UTF-8 conversion.
///
/// Returns the map together with the running byte count: `read` plus the
/// sizes reported by every nested `read_data` call.
fn read_map(&self, offset: usize, read: usize, length: usize) -> (Data, usize) {
    // `length` counts entries, not bytes.
    let mut entries = FxHashMap::with_capacity_and_hasher(length, Default::default());
    let mut consumed = read;

    for _ in 0..length {
        let (key, key_size) = self.read_data(offset + consumed);
        consumed += key_size;

        let (value, value_size) = self.read_data(offset + consumed);
        consumed += value_size;

        let Data::String(key_range) = key else {
            unreachable!()
        };

        // String data is stored as a byte range into `self.data`; resolve it here.
        let name = String::from_utf8_lossy(&self.data[key_range]).into_owned();
        entries.insert(name, value);
    }

    (Data::Map(entries), consumed)
}
/// Decodes an array of `length` elements.
///
/// `offset` is the position of the array's control byte in `self.data` and
/// `read` is the number of bytes already consumed from there. Elements are
/// decoded back to back with `read_data`.
///
/// Returns the array together with the running byte count: `read` plus the
/// size reported for each element.
fn read_array(&self, offset: usize, read: usize, length: usize) -> (Data, usize) {
    let mut consumed = read;
    let mut items = vec![];

    // `length` counts elements, not bytes.
    for _ in 0..length {
        let (item, size) = self.read_data(offset + consumed);
        consumed += size;
        items.push(item);
    }

    (Data::Array(items), consumed)
}
/// Reads a big-endian unsigned integer of `length` bytes (0 to 2) starting at
/// `offset` in `self.data` and wraps it in `Data::U16`.
///
/// An empty field decodes to 0. A field longer than two bytes is treated as
/// impossible and hits `unreachable!()`.
fn read_u16(&self, offset: usize, length: usize) -> Data {
    let bytes = &self.data[offset..offset + length];
    if bytes.len() > 2 {
        unreachable!()
    }

    // Most significant byte first: shift the accumulator up and append each byte.
    let value = bytes
        .iter()
        .fold(0u16, |acc, &byte| (acc << 8) | u16::from(byte));

    Data::U16(value)
}
/// Reads a big-endian integer of `length` bytes (0 to 4) starting at `offset`
/// in `self.data` and wraps it in `Data::I32`.
///
/// Shorter fields are zero-extended on the left, so only a full four-byte
/// field can produce a negative value. An empty field decodes to 0. A field
/// longer than four bytes is treated as impossible and hits `unreachable!()`.
fn read_i32(&self, offset: usize, length: usize) -> Data {
    let bytes = &self.data[offset..offset + length];
    if bytes.len() > 4 {
        unreachable!()
    }

    // Most significant byte first: shift the accumulator up and append each byte.
    let value = bytes
        .iter()
        .fold(0i32, |acc, &byte| (acc << 8) | i32::from(byte));

    Data::I32(value)
}
/// Reads a big-endian unsigned integer of `length` bytes (0 to 4) starting at
/// `offset` in `self.data` and wraps it in `Data::U32`.
///
/// An empty field decodes to 0. A field longer than four bytes is treated as
/// impossible and hits `unreachable!()`.
fn read_u32(&self, offset: usize, length: usize) -> Data {
    let bytes = &self.data[offset..offset + length];
    if bytes.len() > 4 {
        unreachable!()
    }

    // Most significant byte first: shift the accumulator up and append each byte.
    let value = bytes
        .iter()
        .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte));

    Data::U32(value)
}
/// Reads a big-endian unsigned integer of `length` bytes (0 to 8) starting at
/// `offset` in `self.data` and wraps it in `Data::U64`.
///
/// An empty field decodes to 0.
///
/// # Panics
///
/// Panics if `length` exceeds 8 or if the requested range lies outside
/// `self.data`. The explicit length check keeps debug and release builds in
/// agreement: a per-byte shift of 64 bits or more would otherwise panic only
/// in debug builds and yield a wrapped, meaningless value in release builds.
fn read_u64(&self, offset: usize, length: usize) -> Data {
    assert!(
        length <= 8,
        "uint64 field is at most 8 bytes long, got {length}"
    );

    let bytes = &self.data[offset..offset + length];

    // Most significant byte first: shift the accumulator up and append each byte.
    let number = bytes
        .iter()
        .fold(0u64, |acc, &byte| (acc << 8) | u64::from(byte));

    Data::U64(number)
}
/// Reads a big-endian unsigned integer of `length` bytes (0 to 16) starting at
/// `offset` in `self.data` and wraps it in `Data::U128`.
///
/// An empty field decodes to 0.
///
/// # Panics
///
/// Panics if `length` exceeds 16 or if the requested range lies outside
/// `self.data`. The explicit length check keeps debug and release builds in
/// agreement: a per-byte shift of 128 bits or more would otherwise panic only
/// in debug builds and yield a wrapped, meaningless value in release builds.
fn read_u128(&self, offset: usize, length: usize) -> Data {
    assert!(
        length <= 16,
        "uint128 field is at most 16 bytes long, got {length}"
    );

    let bytes = &self.data[offset..offset + length];

    // Most significant byte first: shift the accumulator up and append each byte.
    let number = bytes
        .iter()
        .fold(0u128, |acc, &byte| (acc << 8) | u128::from(byte));

    Data::U128(number)
}
/// Decodes the pointer whose control byte sits at `offset` in `self.data` and
/// returns the value it points to.
///
/// Bits 3-4 of the control byte select the pointer size class (0 to 3) and the
/// low three bits supply the most significant bits of the pointer for classes
/// 0 to 2; class 3 takes the pointer entirely from the following four bytes.
/// Classes 1 and 2 add a fixed bias (2048 and 526336). The resulting pointer is
/// interpreted relative to `metadata.data_section_start`.
///
/// The returned size is the width of the pointer field itself (control byte
/// plus `class + 1` payload bytes); the size of the pointed-to value is
/// deliberately ignored.
fn follow_pointer(&self, offset: usize) -> (Data, usize) {
    let bytes = &self.data[offset..];
    let control = bytes[0];
    let size_class = (control >> 3) & 0b11;
    let high_bits = u32::from(control & 0b0000_0111);

    // Big-endian value of the `count` bytes that follow the control byte.
    let payload = |count: usize| {
        bytes[1..=count]
            .iter()
            .fold(0u32, |acc, &byte| (acc << 8) | u32::from(byte))
    };

    let pointer = match size_class {
        0 => (high_bits << 8) | payload(1),
        1 => ((high_bits << 16) | payload(2)) + 2048,
        2 => ((high_bits << 24) | payload(3)) + 526336,
        3 => payload(4),
        _ => unreachable!(),
    };

    let target = self.metadata.data_section_start + pointer as usize;
    let (value, _) = self.read_data(target);

    (value, size_class as usize + 2)
}
fn read_float<const T: usize>(d: &[u8]) -> Data {
match T {
4 => {
let num = f32::from_be_bytes([d[0], d[1], d[2], d[3]]);
Data::Float(num)
}
8 => {
let num = f64::from_be_bytes([d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]]);
Data::Double(num)
} }
_ => unreachable!(), _ => unreachable!(),
} }

2
run.sh
View File

@ -3,6 +3,6 @@
set -e set -e
set -o pipefail set -o pipefail
RUST_LOG=info cargo run RUST_LOG=info cargo run --release
# RUST_LOG=info cargo run --config 'target."cfg(all())".runner="sudo -E"' # RUST_LOG=info cargo run --config 'target."cfg(all())".runner="sudo -E"'