-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractor.ts
137 lines (116 loc) · 3.93 KB
/
extractor.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import { PDFDocument, PDFName, PDFDict, PDFRawStream, PDFRef } from "pdf-lib";
import fs from "fs";
import path from "path";
import pdf from "pdf-parse";
/**
 * One row of the price-list table as reconstructed from the PDF text by
 * `parseTableEntries`. Every field holds the trimmed text line(s) it was
 * taken from; a field that was never matched stays an empty string.
 */
interface TableEntry {
/** The line starting with "OKING-SU-A9-"; also used as the saved image's base file name. */
number: string;
/** Lines between the number line and the model line, joined with single spaces. */
description: string;
/** The line starting with "For Toyota GR Supra". */
model: string;
/** The line starting with "Carbon Fiber", "Forged Carbon Fiber", "ABS" or "Dry Carbon Fiber". */
material: string;
/** The line starting with "US$". */
note: string;
}
/**
 * Extracts the embedded image streams of a PDF and writes each one into an
 * `images` directory next to this module, naming the files after the part
 * numbers parsed from the PDF's text.
 *
 * Images and table entries are paired purely by order: the n-th image stream
 * encountered (pages in order, XObject keys in dictionary order) is named
 * after the n-th entry returned by `parseTableEntries`. Images without a
 * matching entry are reported with a warning and skipped.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns A promise that resolves once all images have been written.
 * @throws Rejects with the underlying error when the PDF cannot be read or
 *   parsed, or when an image file cannot be written. Errors are deliberately
 *   not swallowed here so the caller can tell success from failure.
 */
async function extractImagesFromPDF(pdfPath: string): Promise<void> {
  // Load the PDF document
  const pdfBytes = fs.readFileSync(pdfPath);
  const pdfDoc = await PDFDocument.load(pdfBytes);

  // Extract the text and log it for debugging
  const { text } = await pdf(pdfBytes);
  console.log("Extracted text:", text);

  // Parse the text into table entries and log them for debugging
  const table = parseTableEntries(text);
  console.log("Parsed table entries:", table);

  // `recursive: true` makes this a no-op when the directory already exists
  const imagesDir = path.join(__dirname, "images");
  fs.mkdirSync(imagesDir, { recursive: true });

  const xObjectKey = PDFName.of("XObject");
  const subtypeKey = PDFName.of("Subtype");
  const imageSubtype = PDFName.of("Image");

  let imageCount = 0;
  for (const [pageIndex, page] of pdfDoc.getPages().entries()) {
    // Resources() resolves indirect references and dictionaries inherited
    // from the page tree, which a raw `get("Resources")` does not.
    const xObject = page.node.Resources()?.lookupMaybe(xObjectKey, PDFDict);
    if (!xObject) {
      continue;
    }

    for (const [keyIndex, key] of xObject.keys().entries()) {
      const candidate = pdfDoc.context.lookup(xObject.get(key));

      // The XObject dictionary also holds non-image entries (e.g. form
      // XObjects); counting those would shift the image/part-number pairing.
      if (
        !(candidate instanceof PDFRawStream) ||
        candidate.dict.lookup(subtypeKey) !== imageSubtype
      ) {
        continue;
      }

      const tableEntry = table[imageCount];
      imageCount++;

      if (!tableEntry) {
        console.warn(
          `No table entry found for image at page ${pageIndex + 1}, index ${keyIndex}`
        );
        continue;
      }

      // The part number comes from PDF text, so keep it from being read as a
      // path (separators) or rejected by the file system (reserved chars).
      const safeNumber = tableEntry.number.replace(/[\\/:*?"<>|]/g, "_");

      // NOTE(review): the bytes are written exactly as stored in the PDF
      // stream (no decoding or transcoding), so the ".webp" extension may not
      // match the actual data format — confirm what downstream tools expect.
      const imageName = `${safeNumber}.webp`;
      fs.writeFileSync(path.join(imagesDir, imageName), candidate.contents);
      console.log(`Saved image: ${imageName}`);
    }
  }

  console.log("Images extracted and saved successfully.");
}
/**
 * Parses the text extracted from the price-list PDF into table entries.
 *
 * The text is processed line by line (each line trimmed, blank lines ignored):
 * - a line starting with "OKING-SU-A9-" opens a new entry and becomes its `number`;
 * - the lines that follow are joined with single spaces into `description`
 *   until a line starting with "For Toyota GR Supra" is found, which becomes `model`;
 * - after the model line, a line starting with one of the known material
 *   names becomes `material` and a line starting with "US$" becomes `note`.
 *
 * Text that appears before the first part number is ignored.
 *
 * @param text - Raw text of the PDF, with lines separated by "\n" (a trailing
 *   "\r" on each line is removed by trimming).
 * @returns The entries in the order their part numbers appear in the text.
 */
function parseTableEntries(text: string): TableEntry[] {
  const numberPrefix = "OKING-SU-A9-";
  const modelPrefix = "For Toyota GR Supra";
  const notePrefix = "US$";
  const materialPrefixes = [
    "Carbon Fiber",
    "Forged Carbon Fiber",
    "ABS",
    "Dry Carbon Fiber",
  ];

  const table: TableEntry[] = [];
  let current: TableEntry | undefined;
  let inDescription = false;

  for (const rawLine of text.split("\n")) {
    const line = rawLine.trim();

    // Blank lines carry no data; appending them to a description would only
    // produce doubled or trailing spaces.
    if (line === "") {
      continue;
    }

    if (line.startsWith(numberPrefix)) {
      if (current) {
        table.push(current);
      }
      current = {
        number: line,
        description: "",
        model: "",
        material: "",
        note: "",
      };
      inDescription = true;
      continue;
    }

    // Nothing to attach the line to before the first part number.
    if (!current) {
      continue;
    }

    if (inDescription) {
      if (line.startsWith(modelPrefix)) {
        current.model = line;
        inDescription = false;
      } else {
        current.description += (current.description ? " " : "") + line;
      }
    } else if (materialPrefixes.some((prefix) => line.startsWith(prefix))) {
      current.material = line;
    } else if (line.startsWith(notePrefix)) {
      current.note = line;
    }
  }

  if (current) {
    table.push(current);
  }
  return table;
}
// Example usage: extract the images of the price list that sits next to the
// working directory. extractImagesFromPDF prints its own success message, so
// only a rejected promise is handled here; repeating the success message in a
// `.then` would print it twice and could follow an already reported failure.
const pdfPath = "./Wholesale Price List For Supra -OKING.pdf";
extractImagesFromPDF(pdfPath).catch((err: unknown) => {
  console.error("Error extracting images:", err);
  // Signal the failure to the invoking shell without cutting off pending output.
  process.exitCode = 1;
});