Added an install script to make sure libtika is set up correctly.
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
#!/bin/bash
# Builds the project in release mode and installs it system-wide:
#   1. cargo build --release
#   2. if ldd reports libtika as "not found" for the built binary, copy
#      libtika_native.so from the extractous build output into /usr/lib64
#   3. copy the binary into /usr/bin
# Every failure path exits with a non-zero status so callers (and `&&` chains)
# can tell that the install did not complete.

echo "compiling ferrisfind..."
if ! cargo build --release; then
    echo "error compiling biary! make sure rust and cargo are installed and setup!" >&2
    exit 1
fi
echo "ferrisfind compiled successfully!"

# NOTE(review): the messages say "ferrisfind" but the binary handled below is
# hyphae_cmd -- confirm which name is current.
# The matching ldd line (if any) is printed, which shows the user what is missing.
if ldd ./target/release/hyphae_cmd | grep libtika | grep "not found"; then
    echo "libtika not found in current sytem, copying from the build directory."
    echo "this will need sudo privileges..."
    if sudo cp ./target/release/build/extractous-*/out/libs/libtika_native.so /usr/lib64/; then
        echo "library copied successfully!"
    else
        echo "error copying libary!" >&2
        # A bare `exit` would return the status of the echo above (0).
        exit 1
    fi
fi

echo "copying binary to your path..."
echo "this will require sudo privileges"
if sudo cp ./target/release/hyphae_cmd /usr/bin; then
    echo "install complete!"
else
    echo "error copying binary!" >&2
    exit 1
fi
|
||||
+245
@@ -0,0 +1,245 @@
|
||||
use clap::Parser;
|
||||
use extractous::Extractor;
|
||||
use rayon::iter::IntoParallelRefMutIterator;
|
||||
use rayon::{ThreadPoolBuilder, iter::ParallelIterator};
|
||||
use reqwest::blocking::get;
|
||||
use serpapi::serpapi::Client;
|
||||
use std::fs::File;
|
||||
use std::io::Write;
|
||||
use std::{collections::HashMap, fs::create_dir_all, path::PathBuf};
|
||||
use tokio;
|
||||
use urlencoding::decode;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(
|
||||
version,
|
||||
about,
|
||||
long_about = "Public file finder and metadata extractor"
|
||||
)]
|
||||
struct Args {
|
||||
///company name
|
||||
#[arg(short, long)]
|
||||
name: String,
|
||||
|
||||
///serpapi key
|
||||
#[arg(short, long)]
|
||||
key: String,
|
||||
|
||||
///number of pages of results to get (note each page is a credit on your sperapi key, defaults to 10)
|
||||
#[arg(short, long)]
|
||||
pages: Option<usize>,
|
||||
|
||||
///the number of threads to use, will default to the rayon default
|
||||
#[arg(short, long)]
|
||||
threads: Option<usize>,
|
||||
|
||||
///folder to download found files into defaults to ./found_files
|
||||
#[arg(short, long)]
|
||||
download_path: Option<PathBuf>,
|
||||
|
||||
///save metadata as csv
|
||||
#[arg(short, long)]
|
||||
csv: Option<PathBuf>,
|
||||
|
||||
///save metadata as csv
|
||||
#[arg(short, long)]
|
||||
markdown: Option<PathBuf>,
|
||||
}
|
||||
|
||||
struct FoundFile {
|
||||
filename: String,
|
||||
url: String,
|
||||
filepath: PathBuf,
|
||||
downloaded: Option<bool>,
|
||||
parsed: Option<bool>,
|
||||
metadata: HashMap<String, String>,
|
||||
}
|
||||
|
||||
impl FoundFile {
|
||||
fn download(&mut self) {
|
||||
println!("downloading {}...", self.filename);
|
||||
if let Ok(res) = get(&self.url) {
|
||||
if res.status().is_success() {
|
||||
if let Ok(data) = res.bytes() {
|
||||
if let Ok(mut local_file) = File::create(&self.filepath) {
|
||||
if let Err(e) = local_file.write(&data) {
|
||||
eprintln!("Error downloading file! {e}");
|
||||
} else {
|
||||
self.downloaded = Some(true);
|
||||
println!("{} Downloaded!", self.filename);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
self.downloaded = Some(false);
|
||||
}
|
||||
|
||||
fn parse(&mut self) {
|
||||
println!("parsing {}", self.filename);
|
||||
let extractor = Extractor::new();
|
||||
if let Ok((_content, metadata)) =
|
||||
extractor.extract_file(&self.filepath.display().to_string())
|
||||
{
|
||||
if !metadata.is_empty() {
|
||||
for (key, value) in metadata {
|
||||
self.metadata.insert(key, value.join(", "));
|
||||
}
|
||||
}
|
||||
}
|
||||
self.parsed = Some(true);
|
||||
println!("{} parsed!", self.filename);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
println!("welcome to ferrisfind!");
|
||||
let args = Args::parse();
|
||||
if let Some(threads) = args.threads {
|
||||
ThreadPoolBuilder::new()
|
||||
.num_threads(threads)
|
||||
.build_global()
|
||||
.unwrap();
|
||||
}
|
||||
let mut pages = Vec::new();
|
||||
if let Some(given_pages) = args.pages {
|
||||
for page in 0..given_pages {
|
||||
pages.push(page + 10);
|
||||
}
|
||||
} else {
|
||||
for page in 0..10 {
|
||||
pages.push(page + 10);
|
||||
}
|
||||
}
|
||||
let mut download_path = PathBuf::from("found_files");
|
||||
if let Some(path) = args.download_path {
|
||||
download_path = path;
|
||||
}
|
||||
if let Err(e) = create_dir_all(&download_path) {
|
||||
println!("error creating download files directory!\n{e}");
|
||||
}
|
||||
let mut settings = HashMap::new();
|
||||
settings.insert("api_key".to_string(), args.key);
|
||||
settings.insert("engine".to_string(), "google".to_string());
|
||||
let client = Client::new(settings).unwrap();
|
||||
let mut params = HashMap::new();
|
||||
params.insert("q".to_string(), format!("Filetype: PDF \"{}\"", args.name));
|
||||
params.insert("h1".to_string(), "en".to_string());
|
||||
params.insert("g1".to_string(), "us".to_string());
|
||||
params.insert("google_domain".to_string(), "google.com".to_string());
|
||||
params.insert("start".to_string(), "0".to_string());
|
||||
let mut files = Vec::new();
|
||||
println!("beginning search");
|
||||
for page in pages {
|
||||
params.insert("start".to_string(), page.to_string());
|
||||
if let Ok(results) = client.search(params.clone()).await {
|
||||
if let Some(arr) = results["organic_results"].as_array() {
|
||||
for item in arr {
|
||||
if let Some(link) = item["link"].as_str() {
|
||||
if link.contains(".pdf") {
|
||||
let link_vec: Vec<&str> = link.split("/").collect();
|
||||
let link = link.to_string();
|
||||
if let Some(filename) = link_vec.last() {
|
||||
let encoded_filename = filename.to_string();
|
||||
let filename = decode(&encoded_filename).unwrap().to_string();
|
||||
let mut filepath = download_path.clone();
|
||||
filepath.push(&filename);
|
||||
let new_file = FoundFile {
|
||||
filename: filename,
|
||||
filepath,
|
||||
url: link,
|
||||
downloaded: None,
|
||||
parsed: None,
|
||||
metadata: HashMap::new(),
|
||||
};
|
||||
files.push(new_file);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
println!("Search finished!");
|
||||
println!("downloding files...");
|
||||
files.par_iter_mut().for_each(|file| {
|
||||
file.download();
|
||||
if file.downloaded == Some(true) {
|
||||
file.parse();
|
||||
}
|
||||
});
|
||||
for file in &files {
|
||||
if file.parsed == Some(true) {
|
||||
println!("_______________________________________________________________");
|
||||
println!("# {}", file.filename);
|
||||
println!("| Type | Data |");
|
||||
println!("| ---- | ---- |");
|
||||
println!("| filename | {} |", file.filename);
|
||||
println!("| url | {} |", file.url);
|
||||
for key in file.metadata.keys() {
|
||||
println!("| {} | {} |", key, file.metadata.get(key).unwrap());
|
||||
}
|
||||
println!("________________________________________________________________");
|
||||
}
|
||||
}
|
||||
if let Some(csv_path) = args.csv {
|
||||
let mut columns = Vec::new();
|
||||
let mut rows = Vec::new();
|
||||
columns.push(String::from("file_name"));
|
||||
columns.push(String::from("url"));
|
||||
for file in &files {
|
||||
if file.parsed == Some(true) {
|
||||
let mut row = vec![file.filename.clone(), file.url.clone()];
|
||||
for key in file.metadata.keys() {
|
||||
if !columns.contains(key) {
|
||||
columns.push(key.clone());
|
||||
}
|
||||
}
|
||||
for key in &columns {
|
||||
if let Some(data) = file.metadata.get(key) {
|
||||
row.push(data.clone());
|
||||
} else {
|
||||
row.push(String::new());
|
||||
}
|
||||
}
|
||||
rows.push(row.join(","));
|
||||
}
|
||||
}
|
||||
if let Ok(mut csv_file) = File::create(csv_path) {
|
||||
let mut out_string = columns.join(",");
|
||||
out_string.push('\n');
|
||||
out_string.push_str(&rows.join("\n"));
|
||||
if let Err(e) = csv_file.write(out_string.as_bytes()) {
|
||||
eprintln!("error writing csv file {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(mdpath) = args.markdown {
|
||||
let mut out_string = String::new();
|
||||
for file in &files {
|
||||
if file.parsed == Some(true) {
|
||||
out_string
|
||||
.push_str("_______________________________________________________________\n");
|
||||
out_string.push_str(&format!("# {}\n", file.filename));
|
||||
out_string.push_str("| Type | Data |\n");
|
||||
out_string.push_str("| ---- | ---- |\n");
|
||||
out_string.push_str(&format!("| filename | {} |\n", file.filename));
|
||||
out_string.push_str(&format!("| url | {} |\n", file.url));
|
||||
for key in file.metadata.keys() {
|
||||
out_string.push_str(&format!(
|
||||
"| {} | {} |\n",
|
||||
key,
|
||||
file.metadata.get(key).unwrap()
|
||||
));
|
||||
}
|
||||
out_string
|
||||
.push_str("________________________________________________________________\n");
|
||||
}
|
||||
}
|
||||
if let Ok(mut md) = File::create(mdpath) {
|
||||
md.write(out_string.as_bytes()).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user