5 Commits

3 changed files with 273 additions and 5 deletions
+3 -5
View File
@@ -51,15 +51,13 @@ find PDFs for google and save to both a CSV and Markdown file:
`ferrisfind -k {serpapi key} -n google -c ./metadata.csv -m ./metadata.md` `ferrisfind -k {serpapi key} -n google -c ./metadata.csv -m ./metadata.md`
# Building # install
clone this repo: clone this repo:
`git clone https://git.pyro.monster/pyro/ferrisfind.git` `git clone https://git.pyro.monster/pyro/ferrisfind.git`
cd into the repo and build with cargo: cd into the repo and run the install script:
`cd ferrisfind` `cd ferrisfind`
`cargo build --release` `./install.sh`
Your compiled binary will be in ferrisfind/target/release/ferrisfind.
Executable
+25
View File
@@ -0,0 +1,25 @@
#!/bin/bash
# Build ferrisfind in release mode and install it system-wide.
#
# Steps:
#   1. cargo build --release
#   2. if the dynamic loader cannot resolve libtika for the new binary, copy the
#      libtika_native.so produced by the extractous build into /usr/lib64
#   3. copy the binary into /usr/bin
#
# Every failure path exits with a non-zero status so callers (and CI) can tell
# that the install did not complete.

# The relative ./target paths below only make sense from the repository root,
# so run from the directory this script lives in regardless of the caller's cwd.
cd "$(dirname "$0")" || exit 1

echo "compiling ferrisfind..."
if ! cargo build --release; then
    echo "error compiling biary! make sure rust and cargo are installed and setup!" >&2
    exit 1
fi
echo "ferrisfind compiled successfully!"

# ldd prints "<lib> => not found" for shared objects the loader cannot resolve.
if ldd ./target/release/ferrisfind | grep libtika | grep "not found"; then
    echo "libtika not found in current sytem, copying from the build directory."
    echo "this will need sudo privileges..."
    if sudo cp ./target/release/build/extractous-*/out/libs/libtika_native.so /usr/lib64/; then
        echo "library copied successfully!"
    else
        echo "error copying libary!" >&2
        # A bare `exit` would reuse the status of the echo above (0).
        exit 1
    fi
fi

echo "copying binary to your path..."
echo "this will require sudo privileges"
if sudo cp ./target/release/ferrisfind /usr/bin; then
    echo "install complete!"
else
    echo "error copying binary!" >&2
    exit 1
fi
+245
View File
@@ -0,0 +1,245 @@
use clap::Parser;
use extractous::Extractor;
use rayon::iter::IntoParallelRefMutIterator;
use rayon::{ThreadPoolBuilder, iter::ParallelIterator};
use reqwest::blocking::get;
use serpapi::serpapi::Client;
use std::fs::File;
use std::io::Write;
use std::{collections::HashMap, fs::create_dir_all, path::PathBuf};
use tokio;
use urlencoding::decode;
// Command-line arguments for ferrisfind, parsed by clap's derive API.
//
// NOTE: the `///` comments on the fields are not just documentation; clap uses
// them as the help text shown by `--help`, so they must describe the flag
// accurately. (This header deliberately uses `//` so it does not become the
// command's `about` text.)
#[derive(Parser, Debug)]
#[command(
    version,
    about,
    long_about = "Public file finder and metadata extractor"
)]
struct Args {
    /// company name
    #[arg(short, long)]
    name: String,
    /// serpapi key
    #[arg(short, long)]
    key: String,
    /// number of pages of results to get (note each page is a credit on your serpapi key, defaults to 10)
    #[arg(short, long)]
    pages: Option<usize>,
    /// the number of threads to use, will default to the rayon default
    #[arg(short, long)]
    threads: Option<usize>,
    /// folder to download found files into, defaults to ./found_files
    #[arg(short, long)]
    download_path: Option<PathBuf>,
    /// save metadata as csv
    #[arg(short, long)]
    csv: Option<PathBuf>,
    /// save metadata as markdown
    #[arg(short, long)]
    markdown: Option<PathBuf>,
}
/// A single search hit: where it came from, where it is stored locally, and
/// what has been learned about it so far.
struct FoundFile {
    // --- identity -----------------------------------------------------------
    /// Remote address the file is fetched from.
    url: String,
    /// Name used when reporting on the file.
    filename: String,
    /// Local path the downloaded bytes are written to.
    filepath: PathBuf,

    // --- processing state ---------------------------------------------------
    /// `None` until a download has been attempted, then `Some(success)`.
    downloaded: Option<bool>,
    /// `None` until a parse has been attempted.
    parsed: Option<bool>,

    // --- results ------------------------------------------------------------
    /// Extracted metadata, one display string per key.
    metadata: HashMap<String, String>,
}
impl FoundFile {
fn download(&mut self) {
println!("downloading {}...", self.filename);
if let Ok(res) = get(&self.url) {
if res.status().is_success() {
if let Ok(data) = res.bytes() {
if let Ok(mut local_file) = File::create(&self.filepath) {
if let Err(e) = local_file.write(&data) {
eprintln!("Error downloading file! {e}");
} else {
self.downloaded = Some(true);
println!("{} Downloaded!", self.filename);
return;
}
}
}
}
}
self.downloaded = Some(false);
}
fn parse(&mut self) {
println!("parsing {}", self.filename);
let extractor = Extractor::new();
if let Ok((_content, metadata)) =
extractor.extract_file(&self.filepath.display().to_string())
{
if !metadata.is_empty() {
for (key, value) in metadata {
self.metadata.insert(key, value.join(", "));
}
}
}
self.parsed = Some(true);
println!("{} parsed!", self.filename);
}
}
#[tokio::main]
async fn main() {
println!("welcome to ferrisfind!");
let args = Args::parse();
if let Some(threads) = args.threads {
ThreadPoolBuilder::new()
.num_threads(threads)
.build_global()
.unwrap();
}
let mut pages = Vec::new();
if let Some(given_pages) = args.pages {
for page in 0..given_pages {
pages.push(page + 10);
}
} else {
for page in 0..10 {
pages.push(page + 10);
}
}
let mut download_path = PathBuf::from("found_files");
if let Some(path) = args.download_path {
download_path = path;
}
if let Err(e) = create_dir_all(&download_path) {
println!("error creating download files directory!\n{e}");
}
let mut settings = HashMap::new();
settings.insert("api_key".to_string(), args.key);
settings.insert("engine".to_string(), "google".to_string());
let client = Client::new(settings).unwrap();
let mut params = HashMap::new();
params.insert("q".to_string(), format!("Filetype: PDF \"{}\"", args.name));
params.insert("h1".to_string(), "en".to_string());
params.insert("g1".to_string(), "us".to_string());
params.insert("google_domain".to_string(), "google.com".to_string());
params.insert("start".to_string(), "0".to_string());
let mut files = Vec::new();
println!("beginning search");
for page in pages {
params.insert("start".to_string(), page.to_string());
if let Ok(results) = client.search(params.clone()).await {
if let Some(arr) = results["organic_results"].as_array() {
for item in arr {
if let Some(link) = item["link"].as_str() {
if link.contains(".pdf") {
let link_vec: Vec<&str> = link.split("/").collect();
let link = link.to_string();
if let Some(filename) = link_vec.last() {
let encoded_filename = filename.to_string();
let filename = decode(&encoded_filename).unwrap().to_string();
let mut filepath = download_path.clone();
filepath.push(&filename);
let new_file = FoundFile {
filename: filename,
filepath,
url: link,
downloaded: None,
parsed: None,
metadata: HashMap::new(),
};
files.push(new_file);
}
}
}
}
}
}
}
println!("Search finished!");
println!("downloding files...");
files.par_iter_mut().for_each(|file| {
file.download();
if file.downloaded == Some(true) {
file.parse();
}
});
for file in &files {
if file.parsed == Some(true) {
println!("_______________________________________________________________");
println!("# {}", file.filename);
println!("| Type | Data |");
println!("| ---- | ---- |");
println!("| filename | {} |", file.filename);
println!("| url | {} |", file.url);
for key in file.metadata.keys() {
println!("| {} | {} |", key, file.metadata.get(key).unwrap());
}
println!("________________________________________________________________");
}
}
if let Some(csv_path) = args.csv {
let mut columns = Vec::new();
let mut rows = Vec::new();
columns.push(String::from("file_name"));
columns.push(String::from("url"));
for file in &files {
if file.parsed == Some(true) {
let mut row = vec![file.filename.clone(), file.url.clone()];
for key in file.metadata.keys() {
if !columns.contains(key) {
columns.push(key.clone());
}
}
for key in &columns {
if let Some(data) = file.metadata.get(key) {
row.push(data.clone());
} else {
row.push(String::new());
}
}
rows.push(row.join(","));
}
}
if let Ok(mut csv_file) = File::create(csv_path) {
let mut out_string = columns.join(",");
out_string.push('\n');
out_string.push_str(&rows.join("\n"));
if let Err(e) = csv_file.write(out_string.as_bytes()) {
eprintln!("error writing csv file {e}");
}
}
}
if let Some(mdpath) = args.markdown {
let mut out_string = String::new();
for file in &files {
if file.parsed == Some(true) {
out_string
.push_str("_______________________________________________________________\n");
out_string.push_str(&format!("# {}\n", file.filename));
out_string.push_str("| Type | Data |\n");
out_string.push_str("| ---- | ---- |\n");
out_string.push_str(&format!("| filename | {} |\n", file.filename));
out_string.push_str(&format!("| url | {} |\n", file.url));
for key in file.metadata.keys() {
out_string.push_str(&format!(
"| {} | {} |\n",
key,
file.metadata.get(key).unwrap()
));
}
out_string
.push_str("________________________________________________________________\n");
}
}
if let Ok(mut md) = File::create(mdpath) {
md.write(out_string.as_bytes()).unwrap();
}
}
}