Skip to content

Commit

Permalink
Split hash cache functionality into multiple classes
Browse files Browse the repository at this point in the history
  • Loading branch information
haykam821 committed Feb 24, 2024
1 parent 8f0ce03 commit d52bf68
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 55 deletions.
58 changes: 58 additions & 0 deletions src/hash/file-hash-cache.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import { HashCache } from "./hash-cache";
import fse from "fs-extra";
import { hashes as log } from "../util/log";
import { resolve } from "node:path";

/**
 * A hash cache that is read from and stored in a JSON file.
 *
 * The file is expected to hold a single JSON object whose keys are names and
 * whose values are the last hash saved for that name.
 */
export class FileHashCache extends HashCache {
	private readonly cache = new Map<string, string>();
	private readonly path: string;

	/**
	 * @param path The path to the JSON hashes file; it is resolved to an absolute path.
	 */
	constructor(path: string) {
		super();

		this.path = resolve(path);
	}

	/**
	 * Extracts the `code` property from a caught value, if it has one.
	 * Caught values are `unknown`, so they must be narrowed before property access.
	 * @param error The caught value.
	 * @returns The value of `error.code`, or `undefined` if there is none.
	 */
	private static getErrorCode(error: unknown): unknown {
		if (typeof error === "object" && error !== null && "code" in error) {
			return (error as { code?: unknown }).code;
		}

		return undefined;
	}

	/**
	 * Writes an empty hashes file. This is best-effort: a failure is logged
	 * rather than thrown, matching how {@link FileHashCache.write} behaves.
	 */
	private async create(): Promise<void> {
		try {
			await fse.writeJSON(this.path, {});
			log("created hashes file at %s", this.path);
		} catch {
			log("failed to create hashes file at %s", this.path);
		}
	}

	/**
	 * Loads hashes from the JSON file into memory.
	 *
	 * If the file does not exist, an empty one is created. Any other failure,
	 * including content that is not a JSON object, is logged and leaves the
	 * in-memory cache untouched. This method does not reject.
	 */
	async read(): Promise<void> {
		let json: unknown;

		try {
			json = await fse.readJSON(this.path);
		} catch (error: unknown) {
			if (FileHashCache.getErrorCode(error) === "ENOENT") {
				await this.create();
			} else {
				log("failed to load hashes file at %s", this.path);
			}

			return;
		}

		// Only a plain JSON object can map names to hashes
		if (typeof json !== "object" || json === null || Array.isArray(json)) {
			log("failed to load hashes file at %s", this.path);
			return;
		}

		log("loaded hashes from %s", this.path);

		for (const [name, hash] of Object.entries(json)) {
			// Skip malformed values so the map only ever holds strings
			if (typeof hash === "string") {
				this.cache.set(name, hash);
			}
		}
	}

	/**
	 * Saves the in-memory hashes to the JSON file. A failure is logged rather than thrown.
	 */
	async write(): Promise<void> {
		try {
			await fse.writeJSON(this.path, Object.fromEntries(this.cache));
			log("saved new hashes to %s", this.path);
		} catch {
			log("failed to save new hashes");
		}
	}

	/**
	 * Checks whether the given hash matches the one stored for the given name,
	 * logging a message when it does.
	 * @param name The name to look up.
	 * @param hash The hash to compare against the stored one.
	 * @returns Whether the stored hash for the name equals the given hash.
	 */
	isCached(name: string, hash: string): boolean {
		if (this.cache.get(name) === hash) {
			log("skipping %s as its hash has not changed", name);
			return true;
		}

		return false;
	}

	/**
	 * Stores a hash for a name in memory, replacing any previous one.
	 * Nothing is persisted until {@link FileHashCache.write} is called.
	 * @param name The name to store the hash under.
	 * @param hash The hash to store.
	 */
	add(name: string, hash: string): void {
		this.cache.set(name, hash);
	}
}
15 changes: 15 additions & 0 deletions src/hash/hash-cache.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
 * Base class for caches that track the last hash saved for each bundle.
 *
 * Subclasses must implement lookup and insertion; loading and saving are
 * optional hooks that do nothing unless overridden.
 */
export abstract class HashCache {
	/**
	 * Checks whether the given hash is the one already cached for the given name.
	 * @param name The name to look up.
	 * @param hash The hash to compare against the cached one.
	 */
	abstract isCached(name: string, hash: string): boolean;

	/**
	 * Records a hash for the given name.
	 * @param name The name to record the hash under.
	 * @param hash The hash to record.
	 */
	abstract add(name: string, hash: string): void;

	/**
	 * Loads the cache. The default implementation does nothing.
	 */
	read(): void | Promise<void> {
		// Intentionally empty: nothing to load by default
	}

	/**
	 * Saves the cache. The default implementation does nothing.
	 */
	write(): void | Promise<void> {
		// Intentionally empty: nothing to save by default
	}
}
19 changes: 19 additions & 0 deletions src/hash/never-hash-cache.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import { HashCache } from "./hash-cache";
import { hashes as log } from "../util/log";

/**
 * A hash cache that does not store or check hashes.
 *
 * Every lookup reports a miss and every insertion is discarded.
 */
export class NeverHashCache extends HashCache {
	/**
	 * Discards the hash; nothing is ever stored.
	 */
	add(): void {
		// Intentionally empty
	}

	/**
	 * Always reports that the hash is not cached.
	 * @returns `false`, unconditionally.
	 */
	isCached(): boolean {
		return false;
	}

	/**
	 * Loads nothing; only logs that hashes are not being checked.
	 */
	read(): void {
		log("not checking bundle hashes");
	}
}
19 changes: 6 additions & 13 deletions src/start.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import { dumping as dumpingLog, hashes as hashesLog, log } from "./util/log";
import { dumping as dumpingLog, log } from "./util/log";
import puppeteer, { SetCookie } from "puppeteer";

import { FileHashCache } from "./hash/file-hash-cache";
import { NeverHashCache } from "./hash/never-hash-cache";
import { RedditDataminerOptions } from "./util/options";
import dumpScripts from "./util/dump-scripts";
import fse from "fs-extra";
import getHashes from "./util/get-hashes";
import getRuntimeScripts from "./util/get-runtime-scripts";
import getScripts from "./util/get-scripts";
import getToken from "./util/get-token";
Expand All @@ -31,7 +32,8 @@ export default async function start(args: RedditDataminerOptions): Promise<strin
await fse.ensureDir(args.path);
log("ensured output path exists");

const hashes = await getHashes(args.hashes);
const hashes = args.hashes ? new FileHashCache(args.hashes) : new NeverHashCache();
await hashes.read();

const browser = await puppeteer.launch({
args: args.sandbox ? [] : noSandboxArgs,
Expand Down Expand Up @@ -83,16 +85,7 @@ export default async function start(args: RedditDataminerOptions): Promise<strin
dumpingLog("failed to dump all scripts");
}

if (args.hashes) {
await fse.writeJSON(args.hashes, hashes)
.then(written => {
hashesLog("saved new hashes to %s", args.hashes);
return written;
})
.catch(() => {
hashesLog("failed to save new hashes");
});
}
await hashes.write();

// Clean up
log("cleaning up");
Expand Down
7 changes: 4 additions & 3 deletions src/util/dump-scripts.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { HashCache } from "../hash/hash-cache";
import { RedditDataminerOptions } from "./options";
import { ScriptInfo } from "../script-info";
import { addModuleSuffix } from "./module-suffix";
Expand All @@ -21,7 +22,7 @@ import { uaGot } from "./got";
* @param knownModules The modules that have already been archived.
* @returns The number of modules archived.
*/
async function dumpScript(script: ScriptInfo, transformersRun: Record<string, string>, args: RedditDataminerOptions, hashes: Record<string, string>, knownModules: Set<string>): Promise<number> {
async function dumpScript(script: ScriptInfo, transformersRun: Record<string, string>, args: RedditDataminerOptions, hashes: HashCache, knownModules: Set<string>): Promise<number> {
const url = script.getUrl();
const index = script.getIndex();

Expand Down Expand Up @@ -91,7 +92,7 @@ async function dumpScript(script: ScriptInfo, transformersRun: Record<string, st
moduleIndex += 1;
}

hashes[name] = hash;
hashes.add(name, hash);
return modules.size;
}

Expand All @@ -103,7 +104,7 @@ async function dumpScript(script: ScriptInfo, transformersRun: Record<string, st
* @param hashes The hashes for previously-saved scripts.
* @returns Whether dumping was attempted for all scripts.
*/
export default async function dumpScripts(scriptUrls: string[], transformersRun: Record<string, string>, args: RedditDataminerOptions, hashes: Record<string, string>): Promise<boolean> {
export default async function dumpScripts(scriptUrls: string[], transformersRun: Record<string, string>, args: RedditDataminerOptions, hashes: HashCache): Promise<boolean> {
const scriptInfos = scriptUrls.map((url, index) => {
return new ScriptInfo(url, index);
}).filter(script => {
Expand Down
27 changes: 0 additions & 27 deletions src/util/get-hashes.ts

This file was deleted.

12 changes: 4 additions & 8 deletions src/util/get-runtime-scripts.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { hashes as hashesLog, log } from "./log";

import { HashCache } from "../hash/hash-cache";
import getHashObjects from "./get-hash-objects";
import got from "got";
import { log } from "./log";

/**
* Gets scripts from the runtime script.
Expand All @@ -10,7 +10,7 @@ import got from "got";
* @param mapIndex The index of the object mapping file names to hashes in the runtime script.
* @param mapBeforeJs Whether to filter runtime script objects by whether they are before a `".js"` string literal.
*/
export default async function getRuntimeScripts(url: string, hashes: Record<string, string>, mapIndex = 1, mapBeforeJs = true): Promise<string[]> {
export default async function getRuntimeScripts(url: string, hashes: HashCache, mapIndex = 1, mapBeforeJs = true): Promise<string[]> {
log("looking for scripts with runtime script");

// Fetch runtime script
Expand All @@ -26,11 +26,7 @@ export default async function getRuntimeScripts(url: string, hashes: Record<stri

// Convert object to script array
return scriptObjEntries.filter(([name, hash]) => {
if (hashes[name] === hash) {
hashesLog("skipping %s as its hash has not changed", name);
return false;
}
return true;
return !hashes.isCached(name, hash);
}).map(([name, hash]) => {
return "https://www.redditstatic.com/desktop2x/" + name + "." + hash + ".js";
});
Expand Down
8 changes: 4 additions & 4 deletions src/util/get-scripts.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import { Browser, SetCookie } from "puppeteer";
import { hashes as hashesLog, log } from "./log";

import { HashCache } from "../hash/hash-cache";
import { filter } from "./filter";
import { log } from "./log";
import { userAgent } from "./user-agent";

const baseURL = "https://new.reddit.com";
Expand All @@ -22,7 +23,7 @@ const noLoadResources = [
* @param cache Whether the browser cache should be enabled.
* @returns The script URLs.
*/
export default async function getScripts(browser: Browser, hashes: Record<string, string>, sessionCookie: SetCookie, cache = false): Promise<string[]> {
export default async function getScripts(browser: Browser, hashes: HashCache, sessionCookie: SetCookie, cache = false): Promise<string[]> {
const page = await browser.newPage();
await page.setUserAgent(userAgent);
await page.setJavaScriptEnabled(false);
Expand Down Expand Up @@ -62,8 +63,7 @@ export default async function getScripts(browser: Browser, hashes: Record<string
if (match === null) return false;

// Ignore script if it has the same hash as saved before
if (hashes[match[1]] === match[2]) {
hashesLog("skipping %s as its hash has not changed", match[1]);
if (hashes.isCached(match[1], match[2])) {
return false;
}

Expand Down

0 comments on commit d52bf68

Please sign in to comment.