Commit 6cba1ee: Draft reddit

jpontoire committed Dec 5, 2024
1 parent 0333ccd commit 6cba1ee
Showing 5 changed files with 230 additions and 0 deletions.
2 changes: 2 additions & 0 deletions minet/cli/commands.py
@@ -14,6 +14,7 @@
from minet.cli.hyphe import HYPHE_COMMAND
from minet.cli.instagram import INSTAGRAM_COMMAND
from minet.cli.mediacloud import MEDIACLOUD_COMMAND
from minet.cli.reddit import REDDIT_COMMAND
from minet.cli.telegram import TELEGRAM_COMMAND
from minet.cli.tiktok import TIKTOK_COMMAND
from minet.cli.twitter import TWITTER_COMMAND
@@ -42,6 +43,7 @@
    HYPHE_COMMAND,
    INSTAGRAM_COMMAND,
    MEDIACLOUD_COMMAND,
    REDDIT_COMMAND,
    TELEGRAM_COMMAND,
    TIKTOK_COMMAND,
    TWITTER_COMMAND,
49 changes: 49 additions & 0 deletions minet/cli/reddit/__init__.py
@@ -0,0 +1,49 @@
# =============================================================================
# Minet Reddit CLI Action
# =============================================================================
#
# Logic of the `rd` action.
#
from minet.cli.argparse import command

REDDIT_POSTS_SUBCOMMAND = command(
    "posts",
    "minet.cli.reddit.posts",
    title="Minet Reddit Posts Command",
    description="""
        Retrieve Reddit posts from a subreddit link.
    """,
    epilog="""
        Example:
        . Searching posts from the subreddit r/france:
            $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv
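        . Retrieving only the 10 most recent posts, using the -n flag declared below:
            $ minet reddit posts -n 10 https://www.reddit.com/r/france > r_france_posts.csv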
""",
variadic_input= {
"dummy_column": "post",
"item_label": "post url, post shortcode or post id",
"item_label_plural": "post urls, post shortcodes or post ids",
},
arguments=[
{
"flags": ["-n", "--number"],
"help": "Number of posts to retrieve.",
"type": int,
}
],
)

REDDIT_COMMAND = command(
    "reddit",
    "minet.cli.reddit",
    "Minet Reddit Command",
    aliases=["rd"],
    description="""
        Collect data from Reddit.
    """,
    subcommands=[
        REDDIT_POSTS_SUBCOMMAND,
    ],
)
44 changes: 44 additions & 0 deletions minet/cli/reddit/posts.py
@@ -0,0 +1,44 @@
# =============================================================================
# Minet Reddit Posts CLI Action
# =============================================================================
#
# Logic of the `rd posts` action.
#
from minet.cli.utils import with_enricher_and_loading_bar
from minet.reddit.scraper import RedditScraper


@with_enricher_and_loading_bar(
    headers=["post_url"],
    title="Scraping posts",
    unit="groups",
    nested=True,
    sub_unit="posts",
)
def action(cli_args, enricher, loading_bar):
    scraper = RedditScraper()

    for i, row, url in enricher.enumerate_cells(
        cli_args.column, with_rows=True, start=1
    ):
        with loading_bar.step(url):
            try:
                if cli_args.number:
                    posts = scraper.get_posts_urls(url, cli_args.number)
                else:
                    posts = scraper.get_posts_urls(url)
            except Exception as error:
                loading_bar.print(f"Could not scrape {url} ({error})")
                continue

            for post in posts:
                loading_bar.nested_advance()
                enricher.writerow(row, [post])
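# Example invocation (a sketch; assumes a "subreddit" column in the input file,
# matching the dummy column declared in the command definition):
#   $ minet reddit posts subreddit -i subreddits.csv > posts.csv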
103 changes: 103 additions & 0 deletions minet/reddit/scraper.py
@@ -0,0 +1,103 @@
from math import ceil
from time import sleep

from ural import get_domain_name, urlpathsplit

from minet.reddit.types import RedditPost
from minet.web import request, create_pool_manager


def get_old_url(url):
    domain = get_domain_name(url)
    path = urlpathsplit(url)
    return f"https://old.{domain}/" + "/".join(path) + "/"


def get_new_url(url):
    domain = get_domain_name(url)
    path = urlpathsplit(url)
    return f"https://www.{domain}/" + "/".join(path) + "/"


def reddit_request(url, pool_manager):
    sleep(1)
    response = request(url, pool_manager=pool_manager)
    remaining_requests = float(response.headers["x-ratelimit-remaining"])
    if remaining_requests == 1:
        # Wait out the rate limit window before retrying
        time_remaining = int(response.headers["x-ratelimit-reset"])
        print(f"Time before next request: {time_remaining}s")
        sleep(time_remaining)
        return reddit_request(url, pool_manager)
    if response.status == 429:
        return reddit_request(url, pool_manager)
    return response
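# A minimal sketch of using the helper directly (any reddit URL works the same
# way; the wrapper sleeps between calls and retries when rate limited):
#   pool_manager = create_pool_manager()
#   response = reddit_request("https://old.reddit.com/r/france/", pool_manager)
#   soup = response.soup()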


class RedditScraper(object):
    def __init__(self):
        self.pool_manager = create_pool_manager()

    def get_posts_urls(self, url, nb_post=25):
        list_posts = set()
        nb_pages = ceil(nb_post / 25)
        old_url = get_old_url(url)
        n_crawled = 0
        for _ in range(nb_pages):
            if n_crawled == nb_post:
                break
            response = reddit_request(old_url, self.pool_manager)
            soup = response.soup()
            list_buttons = soup.select("ul[class='flat-list buttons']")
            for link in list_buttons:
                if n_crawled == nb_post:
                    break
                # Skip promoted (sponsored) posts
                if len(link.scrape("span[class='promoted-span']")) == 0:
                    list_posts.update(link.scrape("a[class^='bylink comments']", "href"))
                    n_crawled += 1
            next_links = soup.scrape("span[class='next-button'] a", "href")
            if not next_links:
                break
            old_url = next_links[0]
        return list(list_posts)


    def get_posts(self, url, nb_post):
        posts = []
        list_posts_url = self.get_posts_urls(url, nb_post)
        for url in list_posts_url:
            response = reddit_request(url, self.pool_manager)
            if response.status == 429:
                print(response.headers)
                print(response.end_url)
            soup = response.soup()
            title = soup.force_select_one("a[class^='title']").get_text()
            upvote = soup.force_select_one("div[class='score'] span").get_text()
            author = soup.scrape_one("a[class^='author']", "href")
            published_date = soup.scrape_one("div[class='date'] time", "datetime")
            link = soup.scrape_one("a[class^='title']", "href")
            # Self-posts link back to themselves; record no external link then
            if urlpathsplit(link) == urlpathsplit(url):
                link = None
            author_text = soup.scrape_one(
                "div[id='siteTable'] div[class^='usertext-body'] div p"
            )
            post = RedditPost(
                title=title,
                url=url,
                author=author,
                author_text=author_text,
                upvote=upvote,
                published_date=published_date,
                link=link,
            )
            posts.append(post)
        return posts
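# A minimal usage sketch (assumes the old-reddit markup still matches the
# selectors above):
#   scraper = RedditScraper()
#   for post in scraper.get_posts("https://www.reddit.com/r/france", 10):
#       print(post.title, post.url)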
32 changes: 32 additions & 0 deletions minet/reddit/types.py
@@ -0,0 +1,32 @@
from typing import Optional

from dataclasses import dataclass
from casanova import TabularRecord


@dataclass
class RedditPost(TabularRecord):
    title: str
    url: str
    author: str
    author_text: str
    upvote: str
    published_date: str
    link: Optional[str]


@dataclass
class RedditComment(TabularRecord):
    # url: str
    # author: str
    id: str
    parent: str
    # points: Optional[str]
    # published_date: str
    comment: str
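# Serialization sketch (assuming casanova's TabularRecord API, where
# fieldnames() yields the CSV header and as_csv_row() a flat row):
#   RedditPost.fieldnames()
#   some_post.as_csv_row()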



