Added the gulagcleaner code

YM162 · Dec 23, 2023 · 40dcffd · 40dcffd
1 parent 83dc88e
commit 40dcffd
Show file tree

Hide file tree

Showing 21 changed files with 4,604 additions and 43 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,7 +1,7 @@
 [workspace]
-
+resolver = "2"
 members = [
-    "gulagcleaner",
+    "gulagcleaner_rs",
     "gulagcleaner_python",
     "gulagcleaner_wasm"
 ]
diff --git a/gulagcleaner/src/lib.rs b/gulagcleaner/src/lib.rs
diff --git a/gulagcleaner_python/Cargo.toml b/gulagcleaner_python/Cargo.toml
@@ -18,4 +18,4 @@ path = "rust/lib.rs"
 
 [dependencies]
 pyo3 = { version = "0.20.0", features = ["extension-module"] }
-gulagcleaner = { path = "../gulagcleaner" }
+gulagcleaner_rs = { path = "../gulagcleaner_rs" }
diff --git a/gulagcleaner_python/LICENSE b/gulagcleaner_python/LICENSE
diff --git a/gulagcleaner_python/README.md b/gulagcleaner_python/README.md
@@ -0,0 +1,71 @@
+
+# Gulag Cleaner
+
+
+[![Twitter](https://a11ybadges.com/badge?logo=twitter)](https://twitter.com/gulagcleaner)
+[![Instagram](https://a11ybadges.com/badge?logo=instagram)](https://www.instagram.com/gulagcleaner/)
+[![Ko-fi](https://a11ybadges.com/badge?logo=kofi)](https://ko-fi.com/L3L86VEX9)
+
+
+Gulag Cleaner is a tool designed to remove advertisements from PDFs, making it easier to read and navigate documents without being disrupted by unwanted ads.
+
+This tool does not just crop the ads out of the PDF; instead, it extracts the original file without ads by manipulating the internal structure of the PDF, ensuring maximum quality.
+
+In addition to removing advertisements, Gulag Cleaner is also capable of extracting metadata, such as the author, subject, university, and more, from the file.
+
+# Web Version
+
+This tool can be used without installation directly from [our website](https://gulagcleaner.com) (in Spanish).
+
+[![Gulag Cleaner webpage](https://raw.githubusercontent.com/YM162/gulagcleaner/main/assets/web_mockup.png)](https://gulagcleaner.com)
+
+# Installation
+
+To install Gulag Cleaner, please [download](https://www.python.org/downloads/) and [install](https://wiki.python.org/moin/BeginnersGuide/Download) Python and then run the following command in your terminal:
+```
+pip install gulagcleaner
+```
+
+# Usage
+
+Gulag Cleaner can be used through both a Command Line Interface (CLI) and in your code.
+
+## Command Line Interface
+
+To use Gulag Cleaner through the CLI, simply run the following command, replacing `<filename>` with the name of one or more PDF files or folders containing PDFs:
+
+```
+gulagcleaner [-r] [-s] [-h] [-v] <filename>...
+```
+
+## Options
+
+Gulag Cleaner provides several options for its usage:
+
+> * '-r': Replace the original file with the cleaned version.
+> * '-s': Do not show metadata about cleaned files.
+> * '-h': Display the help message, providing information on how to use Gulag Cleaner.
+> * '-v': Display the current version of Gulag Cleaner.
+
+## Code
+
+To use Gulag Cleaner in your code, you can use the following code snippet:
+
+```python
+from gulagcleaner.extract import clean_pdf
+
+return_msg = clean_pdf("file.pdf")
+```
+
+# License
+Gulag Cleaner is distributed under the GPL-3 license, which means it's open-source and free to use.
+
+# Contributing
+We're always looking for ways to improve Gulag Cleaner, and we welcome contributions from the community. If you have ideas for improvements or bug fixes, please feel free to submit a pull request.
+
+## TODO
+If you want to help, these are the top priorities right now:
+
+* Revamp the argument parsing. We should use some parsing library to allow for short "-v" and long "--version" arguments. Ideally, it should support parameters for each argument.
+
+* Add the "Naive" cleaning method. This method is just a fallback that crops the ads by zooming in and moving the MediaBox. This is not ideal, but there will always be edge cases not covered by the other methods, and doing this is better than giving an error.
diff --git a/gulagcleaner_python/pyproject.toml b/gulagcleaner_python/pyproject.toml
@@ -8,11 +8,11 @@ version = "0.0.1"
 description = "Ad removal tool for PDFs."
 authors = [
   {name = "YM162", email = "[email protected]"}]
-readme  = "../README.md"
+readme  = "README.md"
 dependencies = [
   "pikepdf>=5.1.2","pdfminer.six>=20220524"
 ]
-license = {file = "../LICENSE"}
+license = {file = "LICENSE"}
 classifiers = ["Programming Language :: Python :: 3",
                 "License :: OSI Approved :: MIT License",
                 "Operating System :: OS Independent"]

diff --git a/gulagcleaner_python/python/gulagcleaner/__pycache__/clean.cpython-38.pyc b/gulagcleaner_python/python/gulagcleaner/__pycache__/clean.cpython-38.pyc
diff --git a/gulagcleaner_python/python/gulagcleaner/__pycache__/command_line.cpython-38.pyc b/gulagcleaner_python/python/gulagcleaner/__pycache__/command_line.cpython-38.pyc
diff --git a/gulagcleaner_python/python/gulagcleaner/__pycache__/decrypt.cpython-38.pyc b/gulagcleaner_python/python/gulagcleaner/__pycache__/decrypt.cpython-38.pyc
diff --git a/gulagcleaner_python/python/gulagcleaner/clean.py b/gulagcleaner_python/python/gulagcleaner/clean.py
@@ -1,14 +1,30 @@
 from ._lib import clean_pdf  # export public parts of the binary extension
 
-#Here there should only be a function clean_pdf(pdf_path, output_path, force_naive) 
-#that calls the rust function and then saves the pdf in the given output_path.
-#It should return a dictionary with the following keys:
-#     Returns:
-#         return_msg (dict): A dictionary with the following keys:
-#             success (bool): Indicates whether the de-embedding process was successful.
-#             return_path (str): The path to the de-embedded file if successful.
-#             error (str): An error description if the process was unsuccessful.
-#     """
-
-def clean_pdf(pdf_path, output_path, force_naive):
-    return clean_pdf(10,6)
+def clean_pdf_path(pdf_path, output_path, force_naive):
+    """
+    Cleans the ads from the PDF file in a given path and saves it in another path.
+    Args:
+        pdf_path (str): The path to the pdf file.
+        output_path (str): The path to save the cleaned pdf file.
+        force_naive (bool): Whether to force the naive cleaning method.
+    Returns:
+        return_msg (dict): A dictionary with the following keys:
+            success (bool): Indicates whether the de-embedding process was successful.
+            return_path (str): The path to the cleaned file if successful.
+            method (str): The method used to clean the file.
+            error (str): An error description if the process was unsuccessful.
+    """
+    try:
+        with open(pdf_path, "rb") as f:
+            pdf = f.read()
+            cleaned_pdf = clean_pdf(pdf, force_naive)
+            with open(output_path, "wb") as f:
+                method = cleaned_pdf[len(cleaned_pdf)-1]
+                cleaned_pdf = cleaned_pdf[0:len(cleaned_pdf)-1]
+                f.write(bytes(cleaned_pdf))
+            return {"success": True, 
+                    "return_path": output_path, 
+                    "method": method,
+                    "error": ""}
+    except Exception as e:
+        return {"success": False, "return_path": "","method":"", "error": str(e)}
diff --git a/gulagcleaner_python/python/gulagcleaner/command_line.py b/gulagcleaner_python/python/gulagcleaner/command_line.py
@@ -1,4 +1,4 @@
-from gulagcleaner.clean import clean_pdf
+from gulagcleaner.clean import clean_pdf_path
 from gulagcleaner.decrypt import decrypt_pdf
 from gulagcleaner.metadata import extract_metadata
 from os.path import exists, isdir, join
@@ -109,13 +109,13 @@ def main():
                 print("Failed to extract metadata:", e)
 
         # Call the cleaning function
-        return_msg = clean_pdf(pdf_path, output_path, force_naive)
+        return_msg = clean_pdf_path(pdf_path, output_path, force_naive)
         remove(pdf_path)
-        if return_msg["Success"]:
+        if return_msg["success"]:
             print("Cleaning successful. File saved in " + 
                   return_msg["return_path"])
         else:
-            print("Error cleaning " + pdf_path + ": " + return_msg["Error"])
+            print("Error cleaning " + pdf_path + ": " + return_msg["error"])
 
 if __name__ == "__main__":
     print('Call from the "gulagcleaner" command.')
diff --git a/gulagcleaner_python/rust/lib.rs b/gulagcleaner_python/rust/lib.rs
@@ -1,10 +1,9 @@
 use pyo3::prelude::*;
-use gulagcleaner;
 
 /// I only need to replace the sum_as_string function with the clean_pdf function, and the arguments to be u8 vectors. Maybe I need to change some types here to pure rust.
 #[pyfunction]
-pub fn clean_pdf(a: usize, b: usize) -> PyResult<String> {
-    Ok(gulagcleaner::clean_pdf(a, b))
+pub fn clean_pdf(data: Vec<u8>,force_naive: u8) -> PyResult<Vec<u8>> {
+    Ok(gulagcleaner_rs::clean_pdf(data, force_naive))
 }
 
 /// I only need to replace the sum_as_string function with the clean_pdf function

diff --git a/gulagcleaner/Cargo.toml → gulagcleaner_rs/Cargo.toml b/gulagcleaner/Cargo.toml → gulagcleaner_rs/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "gulagcleaner"
+name = "gulagcleaner_rs"
 version = "0.10.0"
 edition = "2021"
 authors = ["YM162 <[email protected]>"]
@@ -13,8 +13,7 @@ keywords = ["wuolah", "pdf", "ads", "advertisments", "cleaner", "gulagcleaner"]
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [lib]
-name = "gulagcleaner"
-crate-type = ["lib"]
+name = "gulagcleaner_rs"
 
 [dependencies]
 flate2 = "1.0.27"