Commit 6cba1ee: Draft reddit

jpontoire committed Dec 5, 2024
1 parent 0333ccd commit 6cba1ee
Showing 5 changed files with 230 additions and 0 deletions.
2 changes: 2 additions & 0 deletions minet/cli/commands.py
@@ -14,6 +14,7 @@
from minet.cli.hyphe import HYPHE_COMMAND
from minet.cli.instagram import INSTAGRAM_COMMAND
from minet.cli.mediacloud import MEDIACLOUD_COMMAND
from minet.cli.reddit import REDDIT_COMMAND
from minet.cli.telegram import TELEGRAM_COMMAND
from minet.cli.tiktok import TIKTOK_COMMAND
from minet.cli.twitter import TWITTER_COMMAND
@@ -42,6 +43,7 @@
    HYPHE_COMMAND,
    INSTAGRAM_COMMAND,
    MEDIACLOUD_COMMAND,
    REDDIT_COMMAND,
    TELEGRAM_COMMAND,
    TIKTOK_COMMAND,
    TWITTER_COMMAND,
49 changes: 49 additions & 0 deletions minet/cli/reddit/__init__.py
@@ -0,0 +1,49 @@
# =============================================================================
# Minet Reddit CLI Action
# =============================================================================
#
# Logic of the `rd` action.
#
from minet.cli.argparse import command

REDDIT_POSTS_SUBCOMMAND = command(
    "posts",
    "minet.cli.reddit.posts",
    title="Minet Reddit Posts Command",
    description="""
        Retrieve Reddit posts from a subreddit link.
    """,
    epilog="""
        Example:
        . Searching posts from the subreddit r/france:
            $ minet reddit posts https://www.reddit.com/r/france > r_france_posts.csv
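        . Retrieving only the 10 most recent posts, using the -n flag declared below:
            $ minet reddit posts -n 10 https://www.reddit.com/r/france > r_france_posts.csv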
""",
variadic_input= {
"dummy_column": "post",
"item_label": "post url, post shortcode or post id",
"item_label_plural": "post urls, post shortcodes or post ids",
},
arguments=[
{
"flags": ["-n", "--number"],
"help": "Number of posts to retrieve.",
"type": int,
}
],
)

REDDIT_COMMAND = command(
    "reddit",
    "minet.cli.reddit",
    "Minet Reddit Command",
    aliases=["rd"],
    description="""
        Collect data from Reddit.
    """,
    subcommands=[
        REDDIT_POSTS_SUBCOMMAND,
    ],
)
44 changes: 44 additions & 0 deletions minet/cli/reddit/posts.py
@@ -0,0 +1,44 @@
# =============================================================================
# Minet Reddit Posts CLI Action
# =============================================================================
#
# Logic of the `rd posts` action.
#
from minet.cli.utils import with_enricher_and_loading_bar
from minet.reddit.scraper import RedditScraper


@with_enricher_and_loading_bar(
    headers=["post_url"],
    title="Scraping posts",
    unit="groups",
    nested=True,
    sub_unit="posts",
)
def action(cli_args, enricher, loading_bar):
    scraper = RedditScraper()

    for i, row, url in enricher.enumerate_cells(
        cli_args.column, with_rows=True, start=1
    ):
        with loading_bar.step(url):
            try:
                if cli_args.number:
                    posts = scraper.get_posts_urls(url, cli_args.number)
                else:
                    posts = scraper.get_posts_urls(url)
            except Exception as error:
                loading_bar.print(f"Could not scrape {url} ({error})")
                continue

            for post in posts:
                loading_bar.nested_advance()
                enricher.writerow(row, [post])
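# Example invocation (a sketch; assumes a "subreddit" column in the input file,
# matching the dummy column declared in the command definition):
#   $ minet reddit posts subreddit -i subreddits.csv > posts.csv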
103 changes: 103 additions & 0 deletions minet/reddit/scraper.py
@@ -0,0 +1,103 @@
from math import ceil
from time import sleep

from ural import get_domain_name, urlpathsplit

from minet.reddit.types import RedditPost
from minet.web import request, create_pool_manager


def get_old_url(url):
    domain = get_domain_name(url)
    path = urlpathsplit(url)
    return f"https://old.{domain}/" + "/".join(path) + "/"


def get_new_url(url):
    domain = get_domain_name(url)
    path = urlpathsplit(url)
    return f"https://www.{domain}/" + "/".join(path) + "/"


def reddit_request(url, pool_manager):
    sleep(1)
    response = request(url, pool_manager=pool_manager)
    remaining_requests = float(response.headers["x-ratelimit-remaining"])
    if remaining_requests == 1:
        # Wait out the rate limit window before retrying
        time_remaining = int(response.headers["x-ratelimit-reset"])
        print(f"Time before next request: {time_remaining}s")
        sleep(time_remaining)
        return reddit_request(url, pool_manager)
    if response.status == 429:
        return reddit_request(url, pool_manager)
    return response
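# A minimal sketch of using the helper directly (any reddit URL works the same
# way; the wrapper sleeps between calls and retries when rate limited):
#   pool_manager = create_pool_manager()
#   response = reddit_request("https://old.reddit.com/r/france/", pool_manager)
#   soup = response.soup()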


class RedditScraper(object):
    def __init__(self):
        self.pool_manager = create_pool_manager()

    def get_posts_urls(self, url, nb_post=25):
        list_posts = set()
        nb_pages = ceil(nb_post / 25)
        old_url = get_old_url(url)
        n_crawled = 0
        for _ in range(nb_pages):
            if n_crawled == nb_post:
                break
            response = reddit_request(old_url, self.pool_manager)
            soup = response.soup()
            list_buttons = soup.select("ul[class='flat-list buttons']")
            for link in list_buttons:
                if n_crawled == nb_post:
                    break
                # Skip promoted (sponsored) posts
                if len(link.scrape("span[class='promoted-span']")) == 0:
                    list_posts.update(link.scrape("a[class^='bylink comments']", "href"))
                    n_crawled += 1
            next_links = soup.scrape("span[class='next-button'] a", "href")
            if not next_links:
                break
            old_url = next_links[0]
        return list(list_posts)


    def get_posts(self, url, nb_post):
        posts = []
        list_posts_url = self.get_posts_urls(url, nb_post)
        for url in list_posts_url:
            response = reddit_request(url, self.pool_manager)
            if response.status == 429:
                print(response.headers)
                print(response.end_url)
            soup = response.soup()
            title = soup.force_select_one("a[class^='title']").get_text()
            upvote = soup.force_select_one("div[class='score'] span").get_text()
            author = soup.scrape_one("a[class^='author']", "href")
            published_date = soup.scrape_one("div[class='date'] time", "datetime")
            link = soup.scrape_one("a[class^='title']", "href")
            # Self-posts link back to themselves; record no external link then
            if urlpathsplit(link) == urlpathsplit(url):
                link = None
            author_text = soup.scrape_one(
                "div[id='siteTable'] div[class^='usertext-body'] div p"
            )
            post = RedditPost(
                title=title,
                url=url,
                author=author,
                author_text=author_text,
                upvote=upvote,
                published_date=published_date,
                link=link,
            )
            posts.append(post)
        return posts
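# A minimal usage sketch (assumes the old-reddit markup still matches the
# selectors above):
#   scraper = RedditScraper()
#   for post in scraper.get_posts("https://www.reddit.com/r/france", 10):
#       print(post.title, post.url)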
32 changes: 32 additions & 0 deletions minet/reddit/types.py
@@ -0,0 +1,32 @@
from typing import Optional

from dataclasses import dataclass
from casanova import TabularRecord


@dataclass
class RedditPost(TabularRecord):
    title: str
    url: str
    author: str
    author_text: str
    upvote: str
    published_date: str
    link: Optional[str]


@dataclass
class RedditComment(TabularRecord):
    # url: str
    # author: str
    id: str
    parent: str
    # points: Optional[str]
    # published_date: str
    comment: str
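# Serialization sketch (assuming casanova's TabularRecord API, where
# fieldnames() yields the CSV header and as_csv_row() a flat row):
#   RedditPost.fieldnames()
#   some_post.as_csv_row()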



