Skip to content

Commit

Permalink
Output WARC files in a warcs subdirectory
Browse files: browse the repository at this point in the history
  • Loading branch information
ato committed Dec 20, 2024
1 parent 82cec20 commit a884c18
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions crawler/src/org/netpreserve/warcaroo/Storage.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,7 @@
import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.nio.channels.Channels;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
Expand All @@ -32,14 +33,18 @@ public class Storage implements Closeable {
private final TimeBasedEpochGenerator uuidGenerator;
private final int poolSize = 8;

public Storage(Path directory, Database db, Config config) {
public Storage(Path directory, Database db, Config config) throws IOException {
this.db = db;
this.uuidGenerator = Generators.timeBasedEpochGenerator();
warcPool = new LinkedBlockingDeque<>(poolSize);

Path warcsDir = directory.resolve("warcs");
Files.createDirectories(warcsDir);

String prefix = config.getCrawlSettings().warcPrefix();
if (prefix == null) prefix = "warcaroo";
for (int i = 0; i < poolSize; i++) {
warcPool.add(new WarcRotator(directory, prefix));
warcPool.add(new WarcRotator(warcsDir, prefix));
}
}

Expand Down

0 comments on commit a884c18

Please sign in to comment.