diff --git a/src/install.sh b/src/install.sh
new file mode 100644
index 0000000..8b74842
--- /dev/null
+++ b/src/install.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#
+# Builds ferrisfind in release mode and installs it system-wide.
+#
+# Steps: compile with cargo, copy libtika_native.so into /usr/lib64 when the
+# dynamic linker cannot resolve it, then copy the binary into /usr/bin.
+# Every failed step exits non-zero so callers can detect a broken install.
+
+# The script lives in src/ but the ./target paths below are relative to the
+# crate root, so switch there regardless of where the script was invoked.
+cd "$(dirname "${BASH_SOURCE[0]}")/.." || exit 1
+
+echo "compiling ferrisfind..."
+if ! cargo build --release; then
+    echo "error compiling binary! make sure rust and cargo are installed and setup!" >&2
+    exit 1
+fi
+echo "ferrisfind compiled successfully!"
+
+# ldd marks shared objects it cannot resolve with "not found".
+if ldd ./target/release/hyphae_cmd | grep libtika | grep -q "not found"; then
+    echo "libtika not found in current system, copying from the build directory."
+    echo "this will need sudo privileges..."
+    if ! sudo cp ./target/release/build/extractous-*/out/libs/libtika_native.so /usr/lib64/; then
+        echo "error copying library!" >&2
+        exit 1
+    fi
+    echo "library copied successfully!"
+fi
+
+echo "copying binary to your path..."
+echo "this will require sudo privileges"
+if ! sudo cp ./target/release/hyphae_cmd /usr/bin; then
+    echo "error copying binary!" >&2
+    exit 1
+fi
+echo "install complete!"
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..4274654
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,322 @@
+//! Public file finder and metadata extractor.
+//!
+//! Searches Google through SerpApi for PDF links that mention a company name, downloads every hit
+//! in parallel, extracts each file's metadata and prints it as Markdown tables. The same data can
+//! optionally be saved as CSV and/or Markdown files.
+
+use clap::Parser;
+use extractous::Extractor;
+use rayon::iter::IntoParallelRefMutIterator;
+use rayon::{ThreadPoolBuilder, iter::ParallelIterator};
+use reqwest::blocking::get;
+use serpapi::serpapi::Client;
+use std::collections::BTreeSet;
+use std::error::Error;
+use std::fs::File;
+use std::io::Write;
+use std::path::Path;
+use std::process::exit;
+use std::{collections::HashMap, fs::create_dir_all, path::PathBuf};
+use tokio;
+use urlencoding::decode;
+
+/// Amount the search `start` offset advances from one result page to the next.
+const RESULTS_PER_PAGE: usize = 10;
+/// Number of result pages fetched when `--pages` is not given.
+const DEFAULT_PAGES: usize = 10;
+/// Rule printed above each file's table in the report.
+const REPORT_TOP_RULE: &str = "_______________________________________________________________";
+/// Rule printed below each file's table in the report.
+const REPORT_BOTTOM_RULE: &str = "________________________________________________________________";
+
+#[derive(Parser, Debug)]
+#[command(
+    version,
+    about,
+    long_about = "Public file finder and metadata extractor"
+)]
+struct Args {
+    ///company name
+    #[arg(short, long)]
+    name: String,
+
+    ///serpapi key
+    #[arg(short, long)]
+    key: String,
+
+    ///number of pages of results to get (note each page is a credit on your serpapi key, defaults to 10)
+    #[arg(short, long)]
+    pages: Option<usize>,
+
+    ///the number of threads to use, will default to the rayon default
+    #[arg(short, long)]
+    threads: Option<usize>,
+
+    ///folder to download found files into defaults to ./found_files
+    #[arg(short, long)]
+    download_path: Option<PathBuf>,
+
+    ///save metadata as csv
+    #[arg(short, long)]
+    csv: Option<PathBuf>,
+
+    ///save metadata as markdown
+    #[arg(short, long)]
+    markdown: Option<PathBuf>,
+}
+
+/// A PDF link found in the search results, plus its download and parse state.
+struct FoundFile {
+    /// Local file name derived from the last path segment of `url`.
+    filename: String,
+    /// Link the file is downloaded from.
+    url: String,
+    /// Where the download is stored on disk.
+    filepath: PathBuf,
+    /// `None` until a download was attempted, then whether it succeeded.
+    downloaded: Option<bool>,
+    /// `None` until parsing was attempted, then whether metadata extraction succeeded.
+    parsed: Option<bool>,
+    /// Extracted metadata; the values of a multi-valued entry are joined with ", ".
+    metadata: HashMap<String, String>,
+}
+
+impl FoundFile {
+    /// Downloads `url` to `filepath` and records the outcome in `downloaded`.
+    fn download(&mut self) {
+        println!("downloading {}...", self.filename);
+        match self.fetch_to_disk() {
+            Ok(()) => {
+                self.downloaded = Some(true);
+                println!("{} Downloaded!", self.filename);
+            }
+            Err(e) => {
+                self.downloaded = Some(false);
+                eprintln!("Error downloading file! {}: {e}", self.filename);
+            }
+        }
+    }
+
+    /// Fetches `url` and writes the whole response body to `filepath`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the request fails, the status is not a success, or the file cannot be written.
+    fn fetch_to_disk(&self) -> Result<(), Box<dyn Error>> {
+        let res = get(&self.url)?;
+        if !res.status().is_success() {
+            return Err(format!("unexpected HTTP status {}", res.status()).into());
+        }
+        let data = res.bytes()?;
+        let mut local_file = File::create(&self.filepath)?;
+        // `write` may stop after a partial write; `write_all` continues until every byte is out.
+        local_file.write_all(&data)?;
+        Ok(())
+    }
+
+    /// Extracts the file's metadata into `metadata` and records the outcome in `parsed`.
+    fn parse(&mut self) {
+        println!("parsing {}", self.filename);
+        let extractor = Extractor::new();
+        let path = self.filepath.display().to_string();
+        match extractor.extract_file(&path) {
+            Ok((_content, metadata)) => {
+                for (key, value) in metadata {
+                    self.metadata.insert(key, value.join(", "));
+                }
+                self.parsed = Some(true);
+                println!("{} parsed!", self.filename);
+            }
+            Err(e) => {
+                // Report the failure instead of flagging an unreadable file as parsed.
+                self.parsed = Some(false);
+                eprintln!("error parsing {}: {e}", self.filename);
+            }
+        }
+    }
+
+    /// Renders the file as a Markdown section: a heading, then a table of its metadata.
+    fn to_markdown(&self) -> String {
+        // Sorted so the report is stable; `HashMap` iteration order varies between runs.
+        let mut entries: Vec<(&String, &String)> = self.metadata.iter().collect();
+        entries.sort();
+
+        let mut out = String::new();
+        out.push_str(REPORT_TOP_RULE);
+        out.push('\n');
+        out.push_str(&format!("# {}\n", self.filename));
+        out.push_str("| Type | Data |\n");
+        out.push_str("| ---- | ---- |\n");
+        out.push_str(&format!("| filename | {} |\n", self.filename));
+        out.push_str(&format!("| url | {} |\n", self.url));
+        for (key, value) in entries {
+            out.push_str(&format!("| {key} | {value} |\n"));
+        }
+        out.push_str(REPORT_BOTTOM_RULE);
+        out.push('\n');
+        out
+    }
+}
+
+/// Derives a safe local file name from a result link.
+///
+/// Takes the last path segment (ignoring any query string or fragment), URL-decodes it and keeps
+/// only its final path component. Returns `None` when no usable name is left.
+fn filename_from_url(link: &str) -> Option<String> {
+    let path = link.split(|c: char| c == '?' || c == '#').next()?;
+    let segment = path.rsplit('/').next()?;
+    // Fall back to the raw segment rather than panicking on an escape that is not valid UTF-8.
+    let decoded = match decode(segment) {
+        Ok(name) => name.into_owned(),
+        Err(_) => segment.to_string(),
+    };
+    // An escaped `/` or `..` in the link must not steer the download outside the target folder,
+    // and an absolute path would replace the folder entirely when joined onto it.
+    let name = Path::new(&decoded).file_name()?.to_str()?;
+    Some(name.to_string())
+}
+
+/// Quotes a CSV field when it contains a delimiter, quote or line break, doubling inner quotes.
+fn csv_escape(field: &str) -> String {
+    if field.chars().any(|c| matches!(c, ',' | '"' | '\n' | '\r')) {
+        format!("\"{}\"", field.replace('"', "\"\""))
+    } else {
+        field.to_string()
+    }
+}
+
+/// Builds the CSV report: a header line followed by one line per file.
+fn build_csv(files: &[&FoundFile]) -> String {
+    // Collect every metadata key up front so all rows share the same columns. Growing the header
+    // while emitting rows would leave earlier rows shorter than later ones.
+    let mut keys = BTreeSet::new();
+    for file in files {
+        keys.extend(file.metadata.keys().map(String::as_str));
+    }
+
+    let mut header = vec![String::from("file_name"), String::from("url")];
+    header.extend(keys.iter().map(|key| csv_escape(key)));
+
+    let rows: Vec<String> = files
+        .iter()
+        .map(|file| {
+            let mut row = vec![csv_escape(&file.filename), csv_escape(&file.url)];
+            // The first two cells are already filled, so only metadata keys are looked up here.
+            row.extend(keys.iter().map(|key| match file.metadata.get(*key) {
+                Some(value) => csv_escape(value),
+                None => String::new(),
+            }));
+            row.join(",")
+        })
+        .collect();
+
+    format!("{}\n{}", header.join(","), rows.join("\n"))
+}
+
+/// Writes `contents` to `path`, reporting any I/O error on stderr instead of panicking.
+fn write_report(path: &Path, contents: &str, kind: &str) {
+    let written = File::create(path).and_then(|mut file| file.write_all(contents.as_bytes()));
+    if let Err(e) = written {
+        eprintln!("error writing {kind} file {}: {e}", path.display());
+    }
+}
+
+#[tokio::main]
+async fn main() {
+    println!("welcome to ferrisfind!");
+    let args = Args::parse();
+    if let Some(threads) = args.threads {
+        // Not worth aborting over: only the requested thread count is lost.
+        if let Err(e) = ThreadPoolBuilder::new().num_threads(threads).build_global() {
+            eprintln!("could not set up a pool of {threads} threads: {e}");
+        }
+    }
+    let page_count = args.pages.unwrap_or(DEFAULT_PAGES);
+    let download_path = args.download_path.unwrap_or_else(|| PathBuf::from("found_files"));
+    if let Err(e) = create_dir_all(&download_path) {
+        // Stop before searching: every page costs a credit and no download could be stored.
+        eprintln!("error creating download files directory!\n{e}");
+        exit(1);
+    }
+
+    let mut settings = HashMap::new();
+    settings.insert("api_key".to_string(), args.key);
+    settings.insert("engine".to_string(), "google".to_string());
+    let client = match Client::new(settings) {
+        Ok(client) => client,
+        Err(e) => {
+            eprintln!("error creating serpapi client! {e:?}");
+            exit(1);
+        }
+    };
+
+    let mut params = HashMap::new();
+    // Google only honours search operators written without a space after the colon.
+    params.insert("q".to_string(), format!("filetype:pdf \"{}\"", args.name));
+    // `hl` (interface language) and `gl` (country) are the SerpApi parameter names.
+    params.insert("hl".to_string(), "en".to_string());
+    params.insert("gl".to_string(), "us".to_string());
+    params.insert("google_domain".to_string(), "google.com".to_string());
+
+    let mut files = Vec::new();
+    println!("beginning search");
+    for page in 0..page_count {
+        // `start` is a result offset, so page N begins at N * RESULTS_PER_PAGE.
+        let start = page.saturating_mul(RESULTS_PER_PAGE);
+        params.insert("start".to_string(), start.to_string());
+        let results = match client.search(params.clone()).await {
+            Ok(results) => results,
+            Err(e) => {
+                eprintln!("search for results from offset {start} failed: {e}");
+                continue;
+            }
+        };
+        let Some(arr) = results["organic_results"].as_array() else {
+            continue;
+        };
+        for item in arr {
+            let Some(link) = item["link"].as_str() else {
+                continue;
+            };
+            if !link.contains(".pdf") {
+                continue;
+            }
+            let Some(filename) = filename_from_url(link) else {
+                eprintln!("skipping {link}: no usable file name");
+                continue;
+            };
+            files.push(FoundFile {
+                filepath: download_path.join(&filename),
+                filename,
+                url: link.to_string(),
+                downloaded: None,
+                parsed: None,
+                metadata: HashMap::new(),
+            });
+        }
+    }
+    println!("Search finished!");
+
+    println!("downloading files...");
+    files.par_iter_mut().for_each(|file| {
+        file.download();
+        if file.downloaded == Some(true) {
+            file.parse();
+        }
+    });
+
+    let parsed_files: Vec<&FoundFile> = files
+        .iter()
+        .filter(|file| file.parsed == Some(true))
+        .collect();
+    for file in &parsed_files {
+        print!("{}", file.to_markdown());
+    }
+    if let Some(csv_path) = args.csv {
+        write_report(&csv_path, &build_csv(&parsed_files), "csv");
+    }
+    if let Some(md_path) = args.markdown {
+        let out: String = parsed_files.iter().map(|file| file.to_markdown()).collect();
+        write_report(&md_path, &out, "markdown");
+    }
+}