Create basic, functional git dumper

This commit is contained in:
HoLLy 2022-09-17 15:51:44 +02:00
Коммит 17ef6107f1
9 изменённых файлов: 1491 добавлений и 0 удалений

1
.gitignore поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
/target

1073
Cargo.lock сгенерированный Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

14
Cargo.toml Normal file
Просмотреть файл

@ -0,0 +1,14 @@
[package]
name = "git-dumper"
version = "0.1.0"
edition = "2021"
description = "A tool to dump exposed .git directories"
[dependencies]
anyhow = "1.0.65"
clap = { version = "3.2", features = ["cargo"] }
lazy_static = "1.4.0"
miniz_oxide = "0.6.2"
regex = "1.6"
reqwest = "0.11"
tokio = { version = "1.21", features = ["net", "sync", "rt", "rt-multi-thread", "macros"] }

194
src/dump_git.rs Normal file
Просмотреть файл

@ -0,0 +1,194 @@
use std::{collections::HashSet, path::PathBuf};
use regex::Regex;
use reqwest::StatusCode;
use tokio::sync::mpsc::{self, Sender};
use crate::git_parsing::{parse_hash, parse_head, parse_object, GitObject};
lazy_static::lazy_static! {
    /// Matches the relative path of a loose object file, e.g. "ab/cdef…"
    /// (2 hex chars, a slash, then the remaining 38 hex chars).
    static ref REGEX_OBJECT_PATH: Regex = Regex::new(r"[\da-f]{2}/[\da-f]{38}").unwrap();
}

/// Well-known files inside a `.git` directory that are fetched unconditionally
/// to bootstrap the crawl; everything else is discovered from their contents.
const START_FILES: &[&str] = &[
    "info/exclude",
    "logs/HEAD",
    "objects/info/packs", // TODO: this does not seem to be present anymore?
    "config",
    "COMMIT_EDITMSG",
    "description",
    "FETCH_HEAD",
    "HEAD",
    "index",
    "ORIG_HEAD",
    "packed-refs",
];

// TODO: brute-force files based on known and unknown branch names
/// Tracks which files have already been requested so that each remote path is
/// downloaded at most once.
#[derive(Default)]
struct DownloadCache {
    /// The URL of the exposed .git directory
    base_url: String,
    /// The local path to download the git repo to
    base_path: PathBuf,
    /// Paths (relative to `base_url`) that have already been scheduled.
    cache: HashSet<String>,
}
impl DownloadCache {
    /// Creates an empty cache that downloads from `base_url` into `base_path`.
    fn new(base_url: String, base_path: PathBuf) -> Self {
        Self {
            base_url,
            base_path,
            cache: HashSet::new(),
        }
    }

    /// Downloads a file if it hasn't been downloaded before and sends it to the given channel.
    ///
    /// The HTTP request runs on a spawned task, so this returns immediately;
    /// failures are reported to stdout rather than propagated.
    fn download(&mut self, file_name: &str, sender: Sender<DownloadedFile>) {
        // `HashSet::insert` returns false when the value was already present,
        // replacing the previous contains-then-insert double lookup.
        if !self.cache.insert(file_name.to_owned()) {
            // println!("Skipping download of file {file_name} as it's already downloaded");
            return;
        }

        let url = format!("{}{file_name}", self.base_url);
        let file_name = file_name.to_owned();
        tokio::spawn(async move {
            match reqwest::get(&url).await {
                Ok(resp) => {
                    if resp.status() == StatusCode::OK {
                        // Reading the body can still fail mid-transfer; report
                        // the error instead of unwrapping (fixes the old TODO).
                        match resp.bytes().await {
                            Ok(bytes) => sender
                                .send(DownloadedFile {
                                    name: file_name,
                                    content: bytes.to_vec(),
                                    sender: sender.clone(),
                                })
                                .await
                                // invariant: the receiver in `download_all` lives
                                // until every sender handle has been dropped
                                .expect("receiver should outlive in-flight downloads"),
                            Err(e) => {
                                println!("Error while trying to read body of {url}: {e}");
                            }
                        }
                    } else {
                        println!(
                            "Error while trying to download {url}: status code is {}",
                            resp.status()
                        );
                    }
                }
                Err(e) => {
                    println!("Error while trying to download {url}: {e}");
                }
            }
        });
    }

    /// Schedules the loose object with the given hex hash; git stores these
    /// under `objects/<first 2 chars>/<remaining chars>`.
    fn download_object(&mut self, object_hash: &str, sender: Sender<DownloadedFile>) {
        let hash_start = &object_hash[0..2];
        let hash_end = &object_hash[2..];
        let path = format!("objects/{hash_start}/{hash_end}");
        self.download(&path, sender)
    }
}
/// A fetched file plus a sender handle, so the receiver can schedule further
/// downloads discovered while parsing this file.
#[derive(Debug)]
struct DownloadedFile {
    /// Path of the file relative to the remote `.git` directory.
    pub name: String,
    /// Raw response body.
    pub content: Vec<u8>,
    /// Channel used to enqueue follow-up `DownloadedFile`s.
    pub sender: Sender<DownloadedFile>,
}
/// Crawls an exposed `.git` directory at `base_url`, writing every file that
/// can be discovered into `base_path/.git/`.
///
/// Starts from `START_FILES`, then parses each downloaded file (HEAD, refs,
/// loose objects, …) to learn about more files to fetch. Returns once every
/// sender handle has been dropped, i.e. when no downloads are pending.
pub async fn download_all(base_url: String, base_path: PathBuf) {
    let mut cache = DownloadCache::new(base_url, base_path);

    // TODO: try out unbounded channel too
    // TODO: maybe just have a cli option that determines the limit of concurrent downloads instead?
    let (tx, mut rx) = mpsc::channel(32);

    for &file in START_FILES {
        cache.download(file, tx.clone());
    }

    // drop the sender object so all senders can be out of scope by the end of the download
    drop(tx);

    // every time we downloaded a new file, see what other files we can derive from it
    while let Some(message) = rx.recv().await {
        // write this file to disk
        let path = cache.base_path.join(".git").join(&message.name);
        let path_parent = path
            .parent()
            .expect("There should be at least .git as parent");
        std::fs::create_dir_all(path_parent).unwrap_or_else(|e| {
            println!(
                "Error while trying to create directory {}: {:?}",
                path_parent.to_string_lossy(),
                e
            );
        });
        std::fs::write(path, &message.content).unwrap_or_else(|e| {
            println!(
                "Error while trying to write {} to disk: {:?}",
                message.name, e
            );
        });

        println!(
            "Downloaded '{}' ({} bytes)",
            message.name,
            message.content.len()
        );

        match message.name.as_str() {
            // HEAD names the currently checked-out ref; follow it.
            "HEAD" => match parse_head(&message.content) {
                Ok(ref_path) => {
                    println!("\tFound ref path {ref_path}");
                    cache.download(ref_path, message.sender.clone());
                }
                Err(err) => todo!("{:?}", err),
            },
            "ORIG_HEAD" => match parse_hash(&message.content) {
                Ok(hash) => {
                    println!("\tFound object hash {hash}");
                    cache.download_object(hash, message.sender.clone());
                }
                Err(err) => todo!("{:?}", err),
            },
            // A branch head file holds a single object hash. Use the shared
            // helper instead of re-deriving the objects/xx/yyyy… path by hand
            // (this logic was previously duplicated here).
            n if n.starts_with("refs/heads/") => match parse_hash(&message.content) {
                Ok(hash) => {
                    println!("\tFound object hash {hash}");
                    cache.download_object(hash, message.sender.clone());
                }
                Err(err) => todo!("{:?}", err),
            },
            // Loose objects reference more objects (tree entries, commit
            // parents); schedule whatever they point at.
            n if n.starts_with("objects/") && REGEX_OBJECT_PATH.is_match(n) => {
                match parse_object(&message.content) {
                    Ok(GitObject::Blob) => {
                        println!("\tFound blob object");
                    }
                    Ok(GitObject::Tree(hashes)) => {
                        println!("\tFound tree object with {} hashes", hashes.len());
                        for hash in hashes {
                            cache.download_object(&hash, message.sender.clone());
                        }
                    }
                    Ok(GitObject::Commit(hashes)) => {
                        println!("\tFound commit object with {} hashes", hashes.len());
                        for hash in hashes {
                            cache.download_object(&hash, message.sender.clone());
                        }
                    }
                    Err(err) => todo!("{:?}", err),
                }
            }
            n => {
                println!("\tNot using file '{n}' for anything right now");
            }
        }
    }
}

172
src/git_parsing.rs Normal file
Просмотреть файл

@ -0,0 +1,172 @@
use anyhow::{anyhow, bail, Result};
use miniz_oxide::inflate::TINFLStatus;
use regex::Regex;
use std::fmt::Write;
lazy_static::lazy_static! {
    /// Matches a run of 40 lowercase hex characters (a SHA-1 hash).
    static ref REGEX_HASH: Regex = Regex::new(r"[a-f\d]{40}").unwrap();
    /// Matches a branch ref path such as "refs/heads/main".
    static ref REGEX_REFS_PATH: Regex = Regex::new(r"refs/heads/(\S+)").unwrap();
}
/// A parsed loose git object, reduced to the information the dumper needs:
/// the hex hashes of further objects it references.
pub enum GitObject {
    /// A tree; holds the hex hashes of its entries.
    Tree(Vec<String>),
    /// A commit; holds the hex hashes of its tree and parent commits.
    Commit(Vec<String>),
    /// A blob; references no other objects.
    Blob,
}
/// Extracts the ref path (e.g. "refs/heads/main") from a HEAD file.
///
/// # Errors
/// Fails when the data is not UTF-8, does not start with `"ref: "`, or the
/// remainder does not look like a refs path.
pub fn parse_head(data: &[u8]) -> Result<&str> {
    let content = std::str::from_utf8(data)?;

    // `strip_prefix` validates and removes the prefix in one step, replacing
    // the manual starts_with check plus hard-coded `[5..]` slice.
    let ref_path = match content.strip_prefix("ref: ") {
        Some(rest) => rest.trim_end(),
        None => bail!("HEAD file must start with \"ref: \""),
    };

    if !REGEX_REFS_PATH.is_match(ref_path) {
        bail!("Failed to match refs path in HEAD file");
    }

    Ok(ref_path)
}
/// Parses a file whose entire (trimmed) content is a single object hash,
/// such as `ORIG_HEAD` or a `refs/heads/*` file.
///
/// # Errors
/// Fails when the data is not UTF-8 or is not exactly a 40-char hex hash.
pub fn parse_hash(data: &[u8]) -> Result<&str> {
    let content = std::str::from_utf8(data)?;
    let content = content.trim_end();

    // REGEX_HASH is unanchored, so on its own it would accept any string that
    // merely *contains* a hash and then return the whole string verbatim.
    // Requiring exactly 40 chars makes the match effectively anchored.
    if content.len() != 40 || !REGEX_HASH.is_match(content) {
        bail!("Failed to match hash");
    }

    Ok(content)
}
/// Classifies a zlib-compressed loose git object and extracts the hex hashes
/// of any objects it references.
///
/// Blobs are recognized from the header alone; trees and commits are fully
/// decompressed so their referenced hashes can be collected.
///
/// # Errors
/// Fails when decompression fails, the header is not blob/tree/commit, or the
/// NUL separator after the header is missing.
pub fn parse_object(data: &[u8]) -> Result<GitObject> {
    // Peek only the first few decompressed bytes to tell the type apart.
    let peek = peek_object_type(data)?;
    match peek {
        // Blobs reference nothing, so the body never needs decompressing.
        [b'b', b'l', b'o', b'b', _, _] => Ok(GitObject::Blob),
        [b't', b'r', b'e', b'e', _, _] => {
            let decompressed = miniz_oxide::inflate::decompress_to_vec_zlib(data)
                .map_err(|e| anyhow!("Problem while decompressing git object: {}", e))?;
            let decompressed = decompressed.as_slice();

            let mut hashes = vec![];

            // TODO: this is ugly, use a slice-based approach instead
            // Each tree entry's text part ends at a NUL, followed by a raw
            // 20-byte (0x14) hash: skip to the NUL, drop it, take the hash.
            let mut decompressed_iter = split_object_at_zero(decompressed)?.iter().peekable();
            while decompressed_iter.peek().is_some() {
                let bytes: Vec<u8> = (&mut decompressed_iter)
                    .skip_while(|&&b| b != b'\0')
                    .skip(1)
                    .take(0x14)
                    .cloned()
                    .collect();
                hashes.push(slice_to_hex(&bytes));
            }

            Ok(GitObject::Tree(hashes))
        }
        [b'c', b'o', b'm', b'm', b'i', b't'] => {
            let decompressed = miniz_oxide::inflate::decompress_to_vec_zlib(data)
                .map_err(|e| anyhow!("Problem while decompressing git object: {}", e))?;
            let decompressed = split_object_at_zero(&decompressed)?;
            let commit_message = String::from_utf8_lossy(decompressed);

            // Scan only the header lines (everything before the first blank
            // line); "tree" and "parent" lines carry the hashes we want.
            let hashes = commit_message
                .lines()
                .take_while(|&line| !line.trim().is_empty())
                .filter_map(|line| match line.split_once(' ') {
                    Some(("tree", hash)) => Some(hash.into()),
                    Some(("parent", hash)) => Some(hash.into()),
                    _ => None,
                })
                .collect();

            Ok(GitObject::Commit(hashes))
        }
        _ => bail!(
            "Unknown git object header: {}",
            String::from_utf8_lossy(&peek)
        ),
    }
}
/// Decompresses just enough of the zlib stream to expose the first six bytes
/// of the object header (enough to distinguish blob/tree/commit).
fn peek_object_type(data: &[u8]) -> Result<[u8; 6]> {
    let mut header = [0u8; 6];
    let status = miniz_oxide::inflate::decompress_slice_iter_to_slice(
        &mut header,
        data.chunks(16),
        true,
        true,
    );
    match status {
        // Running out of output space is expected: we only asked for 6 bytes.
        Ok(_) | Err(TINFLStatus::HasMoreOutput) => Ok(header),
        Err(e) => bail!("Error while decompressing object file: {:?}", e),
    }
}
/// Returns the slice following the first NUL byte, i.e. the object body after
/// its header.
///
/// # Errors
/// Fails when the data contains no NUL separator at all.
fn split_object_at_zero(data: &[u8]) -> Result<&[u8]> {
    // Iterator `position` replaces the manual enumerate/find/map chain.
    let idx_zero = data
        .iter()
        .position(|&val| val == b'\0')
        .ok_or_else(|| anyhow!("Malformed object file, could not find null separator"))?;
    Ok(&data[idx_zero + 1..])
}
/// Renders raw bytes as a lowercase hexadecimal string, e.g. a 20-byte hash
/// becomes its familiar 40-character form.
fn slice_to_hex(data: &[u8]) -> String {
    data.iter()
        .fold(String::with_capacity(data.len() * 2), |mut out, byte| {
            write!(out, "{byte:02x}").expect("writing hex should not fail");
            out
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A blob object is recognized from its header alone.
    /// (Renamed from the copy-pasted `parse_commit_blob`.)
    #[test]
    fn parse_blob_object() {
        let bytes = include_bytes!("../test-data/object-blob");
        let parsed = parse_object(bytes).unwrap();
        assert!(matches!(parsed, GitObject::Blob));
    }

    /// A tree object yields the hashes of its entries, in order.
    #[test]
    fn parse_tree_object() {
        let bytes = include_bytes!("../test-data/object-tree");
        let parsed = parse_object(bytes).unwrap();
        assert!(matches!(parsed, GitObject::Tree(_)));
        if let GitObject::Tree(vec) = parsed {
            assert_eq!(
                vec,
                vec![
                    "93748a31e8df89b80ab5ebe4ad19ea62899a28fa".to_string(),
                    "920512d27e4df0c79ca4a929bc5d4254b3d05c4c".to_string(),
                    "f5463e0d810357c84bdb956dcfe70b8015d6fb24".to_string(),
                ]
            );
        }
    }

    /// A commit object yields its tree hash followed by parent hashes.
    #[test]
    fn parse_commit_object() {
        let bytes = include_bytes!("../test-data/object-commit");
        let parsed = parse_object(bytes).unwrap();
        assert!(matches!(parsed, GitObject::Commit(_)));
        if let GitObject::Commit(vec) = parsed {
            assert_eq!(
                vec,
                vec![
                    "faf660b3b793f359495ad23ea2c449da6b3b64a0".to_string(),
                    "1712bc7d3a0e6cf9920541e616310bd30f431728".to_string(),
                ]
            );
        }
    }
}

37
src/main.rs Normal file
Просмотреть файл

@ -0,0 +1,37 @@
use std::path::PathBuf;
use clap::{command, Arg, Command};
mod dump_git;
mod git_parsing;
/// Builds the command-line interface: a required URL positional argument and
/// an optional output directory that defaults to "git-dumped".
fn cli() -> Command<'static> {
    command!()
        .arg(
            Arg::new("URL")
                .required(true)
                .help("The url of the exposed .git directory"),
        )
        .arg(
            Arg::new("PATH")
                .required(false)
                .help("The directory to download to")
                .default_value("git-dumped"),
        )
}
/// Entry point: parses CLI arguments, prepares the output directory, and runs
/// the crawl to completion.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let matches = cli().get_matches();

    // Both unwraps are safe: URL is required and PATH has a default value.
    let url = matches.get_one::<String>("URL").unwrap();
    let path = matches.get_one::<String>("PATH").unwrap();

    let base_path = PathBuf::from(path);
    // Build the target path with `join` instead of string formatting so
    // platform-specific separators are handled correctly.
    std::fs::create_dir_all(base_path.join(".git"))?;
    dump_git::download_all(url.clone(), base_path).await;

    Ok(())
}

Двоичные данные
test-data/object-blob Normal file

Двоичный файл не отображается.

Двоичные данные
test-data/object-commit Normal file

Двоичный файл не отображается.

Двоичные данные
test-data/object-tree Normal file

Двоичный файл не отображается.