Create basic, functional git dumper
This commit is contained in:
Коммит
17ef6107f1
|
@ -0,0 +1 @@
|
|||
/target
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,14 @@
|
|||
[package]
name = "git-dumper"
version = "0.1.0"
edition = "2021"
description = "A tool to dump exposed .git directories"

[dependencies]
# Ad-hoc application-style error handling (`anyhow!`, `bail!`, `Result`).
anyhow = "1.0.65"
# CLI parsing; the "cargo" feature derives name/version/about from this manifest.
clap = { version = "3.2", features = ["cargo"] }
# Lazily-initialized statics, used for compiled regexes.
lazy_static = "1.4.0"
# Pure-Rust zlib inflate, used to decompress loose git object files.
miniz_oxide = "0.6.2"
# Validation of hashes, ref paths, and object paths.
regex = "1.6"
# HTTP client for fetching files from the exposed .git directory.
reqwest = "0.11"
# Async runtime; only the features the downloader needs are enabled.
tokio = { version = "1.21", features = ["net", "sync", "rt", "rt-multi-thread", "macros"] }
|
|
@ -0,0 +1,194 @@
|
|||
use std::{collections::HashSet, path::PathBuf};
|
||||
|
||||
use regex::Regex;
|
||||
use reqwest::StatusCode;
|
||||
use tokio::sync::mpsc::{self, Sender};
|
||||
|
||||
use crate::git_parsing::{parse_hash, parse_head, parse_object, GitObject};
|
||||
|
||||
lazy_static::lazy_static! {
    // Relative path of a loose git object: two hex chars (fan-out directory)
    // followed by the remaining 38 hex chars of the SHA-1 hash.
    // NOTE(review): deliberately unanchored — `is_match` checks containment,
    // and the caller pre-filters with `starts_with("objects/")`.
    static ref REGEX_OBJECT_PATH: Regex = Regex::new(r"[\da-f]{2}/[\da-f]{38}").unwrap();
}
|
||||
|
||||
/// Well-known files under `.git/` that are fetched unconditionally to seed
/// the crawl; every further file is discovered from the contents of these.
const START_FILES: &[&str] = &[
    "info/exclude",
    "logs/HEAD",
    "objects/info/packs", // TODO: this does not seem to be present anymore?
    "config",
    "COMMIT_EDITMSG",
    "description",
    "FETCH_HEAD",
    "HEAD",
    "index",
    "ORIG_HEAD",
    "packed-refs",
];
|
||||
|
||||
// TODO: brute-force files based on known and unknown branch names
|
||||
|
||||
/// Tracks which remote files have already been requested so that each file
/// under the exposed `.git` directory is downloaded at most once.
#[derive(Default)]
struct DownloadCache {
    /// The URL of the exposed .git directory
    base_url: String,
    /// The local path to download the git repo to
    base_path: PathBuf,
    /// Relative paths (under `.git/`) that have already been scheduled.
    cache: HashSet<String>,
}
|
||||
|
||||
impl DownloadCache {
|
||||
fn new(base_url: String, base_path: PathBuf) -> Self {
|
||||
Self {
|
||||
base_url,
|
||||
base_path,
|
||||
cache: HashSet::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Downloads a file if it hasn't been downloaded before and sends it to the given channel.
|
||||
fn download(&mut self, file_name: &str, sender: Sender<DownloadedFile>) {
|
||||
if self.cache.contains(file_name) {
|
||||
// println!("Skipping download of file {file_name} as it's already downloaded");
|
||||
return;
|
||||
}
|
||||
|
||||
self.cache.insert(file_name.into());
|
||||
|
||||
let url = format!("{}{file_name}", self.base_url);
|
||||
let file_name = file_name.into();
|
||||
tokio::spawn(async move {
|
||||
let got = reqwest::get(&url).await;
|
||||
match got {
|
||||
Ok(resp) => {
|
||||
if resp.status() == StatusCode::OK {
|
||||
let bytes = resp.bytes().await.unwrap(); // TODO: ugh, fix this unwrap
|
||||
sender
|
||||
.send(DownloadedFile {
|
||||
name: file_name,
|
||||
content: bytes.to_vec(),
|
||||
sender: sender.clone(),
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
} else {
|
||||
println!(
|
||||
"Error while trying to download {url}: status code is {}",
|
||||
resp.status()
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
println!("Error while trying to download {url}: {e}");
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn download_object(&mut self, object_hash: &str, sender: Sender<DownloadedFile>) {
|
||||
let hash_start = &object_hash[0..2];
|
||||
let hash_end = &object_hash[2..];
|
||||
let path = format!("objects/{hash_start}/{hash_end}");
|
||||
self.download(&path, sender)
|
||||
}
|
||||
}
|
||||
|
||||
/// A fully downloaded file, sent from the download tasks back to the main
/// processing loop.
#[derive(Debug)]
struct DownloadedFile {
    /// Path of the file relative to the `.git` directory.
    pub name: String,
    /// Raw file contents as fetched from the server.
    pub content: Vec<u8>,
    /// Clone of the channel sender, carried along so the receiver can
    /// schedule follow-up downloads discovered inside this file.
    pub sender: Sender<DownloadedFile>,
}
|
||||
|
||||
pub async fn download_all(base_url: String, base_path: PathBuf) {
|
||||
let mut cache = DownloadCache::new(base_url, base_path);
|
||||
|
||||
// TODO: try out unbounded channel too
|
||||
// TODO: maybe just have a cli option that determines the limit of concurrent downloads instead?
|
||||
let (tx, mut rx) = mpsc::channel(32);
|
||||
|
||||
for &file in START_FILES {
|
||||
let new_tx = tx.clone();
|
||||
cache.download(file, new_tx);
|
||||
}
|
||||
|
||||
// drop the sender object so all senders can be out of scope by the end of the download
|
||||
drop(tx);
|
||||
|
||||
// every time we downloaded a new file, see what other files we can derive from it
|
||||
while let Some(message) = rx.recv().await {
|
||||
// write this file to disk
|
||||
let path = cache.base_path.join(".git").join(&message.name);
|
||||
let path_parent = path
|
||||
.parent()
|
||||
.expect("There should be at least .git as parent");
|
||||
|
||||
std::fs::create_dir_all(path_parent).unwrap_or_else(|e| {
|
||||
println!(
|
||||
"Error while trying to create directory {}: {:?}",
|
||||
path_parent.to_string_lossy(),
|
||||
e
|
||||
);
|
||||
});
|
||||
std::fs::write(path, &message.content).unwrap_or_else(|e| {
|
||||
println!(
|
||||
"Error while trying to write {} to disk: {:?}",
|
||||
message.name, e
|
||||
);
|
||||
});
|
||||
|
||||
println!(
|
||||
"Downloaded '{}' ({} bytes)",
|
||||
message.name,
|
||||
message.content.len()
|
||||
);
|
||||
|
||||
match message.name.as_str() {
|
||||
"HEAD" => match parse_head(&message.content) {
|
||||
Ok(ref_path) => {
|
||||
println!("\tFound ref path {ref_path}");
|
||||
cache.download(ref_path, message.sender.clone());
|
||||
}
|
||||
Err(err) => todo!("{:?}", err),
|
||||
},
|
||||
"ORIG_HEAD" => match parse_hash(&message.content) {
|
||||
Ok(hash) => {
|
||||
println!("\tFound object hash {hash}");
|
||||
cache.download_object(hash, message.sender.clone());
|
||||
}
|
||||
Err(err) => todo!("{:?}", err),
|
||||
},
|
||||
n if n.starts_with("refs/heads/") => match parse_hash(&message.content) {
|
||||
Ok(hash) => {
|
||||
println!("\tFound object hash {hash}");
|
||||
let hash_start = &hash[0..2];
|
||||
let hash_end = &hash[2..];
|
||||
let path = format!("objects/{hash_start}/{hash_end}");
|
||||
cache.download(&path, message.sender.clone());
|
||||
}
|
||||
Err(err) => todo!("{:?}", err),
|
||||
},
|
||||
n if n.starts_with("objects/") && REGEX_OBJECT_PATH.is_match(n) => {
|
||||
match parse_object(&message.content) {
|
||||
Ok(GitObject::Blob) => {
|
||||
println!("\tFound blob object");
|
||||
}
|
||||
Ok(GitObject::Tree(hashes)) => {
|
||||
println!("\tFound tree object with {} hashes", hashes.len());
|
||||
for hash in hashes {
|
||||
cache.download_object(&hash, message.sender.clone());
|
||||
}
|
||||
}
|
||||
Ok(GitObject::Commit(hashes)) => {
|
||||
println!("\tFound commit object with {} hashes", hashes.len());
|
||||
for hash in hashes {
|
||||
cache.download_object(&hash, message.sender.clone());
|
||||
}
|
||||
}
|
||||
Err(err) => todo!("{:?}", err),
|
||||
}
|
||||
}
|
||||
n => {
|
||||
println!("\tNot using file '{n}' for anything right now");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,172 @@
|
|||
use anyhow::{anyhow, bail, Result};
|
||||
use miniz_oxide::inflate::TINFLStatus;
|
||||
use regex::Regex;
|
||||
use std::fmt::Write;
|
||||
|
||||
lazy_static::lazy_static! {
    // Anchored with \A/\z: the previous unanchored patterns made `is_match`
    // accept any input merely *containing* a hash or refs path, so
    // `parse_hash`/`parse_head` would let arbitrary surrounding garbage
    // through validation and return it to the caller.
    static ref REGEX_HASH: Regex = Regex::new(r"\A[a-f\d]{40}\z").unwrap();
    static ref REGEX_REFS_PATH: Regex = Regex::new(r"\Arefs/heads/(\S+)\z").unwrap();
}
|
||||
|
||||
/// The decoded payload of a loose git object, reduced to what the dumper
/// needs: the hex hashes of further objects to fetch.
pub enum GitObject {
    /// A tree; holds the hex hashes of its entries.
    Tree(Vec<String>),
    /// A commit; holds its tree hash plus any parent commit hashes.
    Commit(Vec<String>),
    /// A blob; a leaf — references no further objects.
    Blob,
}
|
||||
|
||||
pub fn parse_head(data: &[u8]) -> Result<&str> {
|
||||
let content = std::str::from_utf8(data)?;
|
||||
|
||||
if !content.starts_with("ref: ") {
|
||||
bail!("HEAD file must start with \"ref: \"");
|
||||
}
|
||||
|
||||
let content = (&content[5..]).trim_end();
|
||||
|
||||
if !REGEX_REFS_PATH.is_match(content) {
|
||||
bail!("Failed to match refs path in HEAD file");
|
||||
}
|
||||
|
||||
Ok(content)
|
||||
}
|
||||
|
||||
pub fn parse_hash(data: &[u8]) -> Result<&str> {
|
||||
let content = std::str::from_utf8(data)?;
|
||||
let content = content.trim_end();
|
||||
|
||||
if !REGEX_HASH.is_match(content) {
|
||||
bail!("Failed to match hash");
|
||||
}
|
||||
|
||||
Ok(content)
|
||||
}
|
||||
|
||||
/// Decompresses a loose git object, classifies it by its header type tag,
/// and extracts any object hashes it references.
///
/// # Errors
/// Fails when the data cannot be zlib-decompressed, has no NUL header
/// separator, or the type tag is not blob/tree/commit.
pub fn parse_object(data: &[u8]) -> Result<GitObject> {
    // Peek at the first 6 decompressed bytes to read the type tag without
    // inflating the whole object up front.
    let peek = peek_object_type(data)?;
    match peek {
        // "blob…" — nothing to extract from the payload.
        [b'b', b'l', b'o', b'b', _, _] => Ok(GitObject::Blob),
        [b't', b'r', b'e', b'e', _, _] => {
            let decompressed = miniz_oxide::inflate::decompress_to_vec_zlib(data)
                .map_err(|e| anyhow!("Problem while decompressing git object: {}", e))?;
            let decompressed = decompressed.as_slice();

            let mut hashes = vec![];

            // TODO: this is ugly, use a slice-based approach instead
            // For each entry: skip to the next NUL, then take the 0x14 (20)
            // raw id bytes that follow it and hex-encode them.
            let mut decompressed_iter = split_object_at_zero(decompressed)?.iter().peekable();
            while decompressed_iter.peek().is_some() {
                let bytes: Vec<u8> = (&mut decompressed_iter)
                    .skip_while(|&&b| b != b'\0')
                    .skip(1)
                    .take(0x14)
                    .cloned()
                    .collect();
                hashes.push(slice_to_hex(&bytes));
            }

            Ok(GitObject::Tree(hashes))
        }
        [b'c', b'o', b'm', b'm', b'i', b't'] => {
            let decompressed = miniz_oxide::inflate::decompress_to_vec_zlib(data)
                .map_err(|e| anyhow!("Problem while decompressing git object: {}", e))?;

            // Drop everything up to the first NUL (the object header) before
            // reading the textual commit body.
            let decompressed = split_object_at_zero(&decompressed)?;
            let commit_message = String::from_utf8_lossy(decompressed);

            // Header lines run until the first blank line; collect the hash
            // operand of every "tree" and "parent" line.
            let hashes = commit_message
                .lines()
                .take_while(|&line| !line.trim().is_empty())
                .filter_map(|line| match line.split_once(' ') {
                    Some(("tree", hash)) => Some(hash.into()),
                    Some(("parent", hash)) => Some(hash.into()),
                    _ => None,
                })
                .collect();

            Ok(GitObject::Commit(hashes))
        }
        _ => bail!(
            "Unknown git object header: {}",
            String::from_utf8_lossy(&peek)
        ),
    }
}
|
||||
|
||||
/// Decompresses just enough of a zlib-compressed object to return its first
/// 6 bytes — enough to distinguish the "blob"/"tree"/"commit" type tags.
// NOTE(review): the two boolean flags presumably mean "zlib header present"
// and "compute adler32" — confirm against the miniz_oxide docs.
fn peek_object_type(data: &[u8]) -> Result<[u8; 6]> {
    let mut array = [0u8; 6];
    match miniz_oxide::inflate::decompress_slice_iter_to_slice(
        &mut array,
        data.chunks(16),
        true,
        true,
    ) {
        // HasMoreOutput just means the object is longer than our 6-byte
        // window — expected, since the header bytes are already filled in.
        Ok(_) | Err(TINFLStatus::HasMoreOutput) => Ok(array),
        Err(e) => bail!("Error while decompressing object file: {:?}", e),
    }
}
|
||||
|
||||
fn split_object_at_zero(data: &[u8]) -> Result<&[u8]> {
|
||||
let idx_zero = data
|
||||
.iter()
|
||||
.enumerate()
|
||||
.find(|(_, &val)| val == b'\0')
|
||||
.map(|(idx, _)| idx)
|
||||
.ok_or_else(|| anyhow!("Malformed object file, could not find null separator"))?;
|
||||
let data = &data[idx_zero + 1..];
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
/// Renders a byte slice as lowercase hex, two characters per byte.
fn slice_to_hex(data: &[u8]) -> String {
    // Fold into a pre-sized string; each byte appends exactly two chars.
    data.iter()
        .fold(String::with_capacity(data.len() * 2), |mut hex, byte| {
            write!(hex, "{byte:02x}").expect("writing hex should not fail");
            hex
        })
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A raw blob object from test-data parses to `GitObject::Blob`.
    #[test]
    fn parse_commit_blob() {
        let bytes = include_bytes!("../test-data/object-blob");
        let parsed = parse_object(bytes).unwrap();
        assert!(matches!(parsed, GitObject::Blob));
    }

    /// A tree object parses to `GitObject::Tree` and yields the expected
    /// entry hashes in order.
    #[test]
    fn parse_tree_object() {
        let bytes = include_bytes!("../test-data/object-tree");
        let parsed = parse_object(bytes).unwrap();
        assert!(matches!(parsed, GitObject::Tree(_)));

        if let GitObject::Tree(vec) = parsed {
            assert_eq!(
                vec,
                vec![
                    "93748a31e8df89b80ab5ebe4ad19ea62899a28fa".to_string(),
                    "920512d27e4df0c79ca4a929bc5d4254b3d05c4c".to_string(),
                    "f5463e0d810357c84bdb956dcfe70b8015d6fb24".to_string(),
                ]
            );
        }
    }

    /// A commit object parses to `GitObject::Commit` and yields its tree
    /// hash followed by the parent hash.
    #[test]
    fn parse_commit_object() {
        let bytes = include_bytes!("../test-data/object-commit");
        let parsed = parse_object(bytes).unwrap();
        assert!(matches!(parsed, GitObject::Commit(_)));

        if let GitObject::Commit(vec) = parsed {
            assert_eq!(
                vec,
                vec![
                    "faf660b3b793f359495ad23ea2c449da6b3b64a0".to_string(),
                    "1712bc7d3a0e6cf9920541e616310bd30f431728".to_string(),
                ]
            );
        }
    }
}
|
|
@ -0,0 +1,37 @@
|
|||
use std::path::PathBuf;
|
||||
|
||||
use clap::{command, Arg, Command};
|
||||
|
||||
mod dump_git;
|
||||
mod git_parsing;
|
||||
|
||||
/// Builds the command-line interface: a required positional URL argument
/// and an optional target directory that defaults to "git-dumped".
fn cli() -> Command<'static> {
    command!()
        .arg(
            Arg::new("URL")
                .required(true)
                .help("The url of the exposed .git directory"),
        )
        .arg(
            Arg::new("PATH")
                .required(false)
                .help("The directory to download to")
                .default_value("git-dumped"),
        )
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let matches = cli().get_matches();
|
||||
let url = matches.get_one::<String>("URL").unwrap();
|
||||
let path = matches.get_one::<String>("PATH").unwrap();
|
||||
|
||||
// println!("URL: {url}");
|
||||
// println!("PATH: {path}");
|
||||
|
||||
std::fs::create_dir_all(format!("{path}/.git/"))?;
|
||||
|
||||
dump_git::download_all(url.clone(), PathBuf::from(path)).await;
|
||||
|
||||
Ok(())
|
||||
}
|
Двоичный файл не отображается.
Двоичный файл не отображается.
Двоичный файл не отображается.
Загрузка…
Ссылка в новой задаче