Commit faddc65

ongoing changes

pcdeadeasy committed Feb 5, 2025
1 parent 1be54dc
Showing 4 changed files with 417 additions and 44 deletions.
4 changes: 2 additions & 2 deletions ts/examples/docuProc/package.json
@@ -23,21 +23,21 @@
  },
  "dependencies": {
    "aiclient": "workspace:*",
    "canvas": "^3.1.0",
    "chalk": "^5.3.0",
    "code-processor": "workspace:*",
    "dotenv": "^16.3.1",
    "fast-xml-parser": "4.5.1",
    "interactive-app": "workspace:*",
    "knowledge-processor": "workspace:*",
    "memory-providers": "workspace:*",
    "pdf-parse": "1.1.1",
    "pdfjs-dist": "^4.10.38",
    "typeagent": "workspace:*",
    "typechat": "^0.1.1",
    "typescript": "^5.4.2"
  },
  "devDependencies": {
    "@types/node": "^18.18.7",
    "@types/pdf-parse": "^1.1.4",
    "copyfiles": "^2.4.1",
    "rimraf": "^5.0.5"
  },
277 changes: 274 additions & 3 deletions ts/examples/docuProc/src/docuProc.ts
@@ -7,6 +7,8 @@ import { fetchWithRetry } from "aiclient";
import path from "path";
import fs from "fs";
import { fileURLToPath } from "url";
import { getDocument, OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
import { createCanvas } from "canvas";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
@@ -29,6 +31,13 @@ interface ArxivPaper {
    journal_ref?: string;
}

interface PdfChunk {
    page: number;
    chunkIndex: number;
    text: string;
    imageRefs?: string[];
}
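
For reference, one file written by the chunking code below might deserialize into this shape (all values illustrative, not taken from a real paper):

// Illustrative contents of a chunk file such as p_1_c_0.json
const exampleChunk: PdfChunk = {
    page: 1,
    chunkIndex: 0,
    text: "Attention Is All You Need ... ",
    imageRefs: [],
};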

export async function fetchArxivPapers(
    query: ArxivQuery,
): Promise<any[] | undefined> {
@@ -51,7 +60,7 @@ export async function fetchArxivPapers(
    const queryParams = new URLSearchParams({
        search_query: `${searchPrefix}${query.searchTerm}`,
        start: String(query.start ?? 0),
-        max_results: String(query.maxResults ?? 5),
+        max_results: String(query.maxResults ?? 3),
        sortBy: query.sortBy ?? "relevance",
        sortOrder: query.sortOrder ?? "descending",
    });
@@ -121,6 +130,17 @@ export function printArxivPaperParsedData(papers: ArxivPaper[]) {
    });
}

export async function createFolderIfNotExists(
    folderPath: string,
): Promise<void> {
    try {
        await fs.promises.mkdir(folderPath, { recursive: true });
        console.log(`Folder '${folderPath}' is ready.`);
    } catch (error) {
        console.error("Error creating folder:", error);
    }
}

export function getValidFilename(paperId: string): string {
    return paperId.replace(/\//g, "__");
}
@@ -134,7 +154,9 @@ function getPdfUrlFromId(id: string): { paperId: string; downloadUrl: string } {
    return { paperId: `${pid}`, downloadUrl: `https://arxiv.org/pdf/${pid}` };
}

-export async function downloadArxivPaper(paper: ArxivPaper) {
+export async function downloadArxivPaper(
+    paper: ArxivPaper,
+): Promise<string | undefined> {
    const arxivInfo = getPdfUrlFromId(paper.id);

    const outputDir = path.join(__dirname, "papers");
@@ -168,6 +190,255 @@ export async function downloadArxivPaper(paper: ArxivPaper) {
        return filePath;
    } catch (error) {
        console.error("Error downloading paper:", error);
-        return null;
+        return undefined;
    }
}

export async function extractTextChunksFromPdf(
    pdfPath: string,
    chunkSize: number = 4096,
): Promise<void> {
    try {
        let outputDir = path.join(__dirname, "papers");
        const folderName = path.parse(pdfPath).name;

        outputDir = path.join(outputDir, folderName);
        const pagesDir = path.join(outputDir, "pages");

        // Create the full output path (not just the bare folder name) and
        // await both calls so the chunk writes below cannot race the mkdirs.
        await createFolderIfNotExists(outputDir);
        await createFolderIfNotExists(pagesDir);

        const data = new Uint8Array(fs.readFileSync(pdfPath));
        const loadingTask = getDocument({ data });
        const pdfDocument = await loadingTask.promise;

        let chunkIndex = 0;
        for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
            const page = await pdfDocument.getPage(pageNum);
            const textContent = await page.getTextContent();
            //const operatorList = await page.getOperatorList();

            let currentText = "";
            const imageRefs: string[] = [];

            // Chunk text
            for (const item of textContent.items) {
                if ("str" in item) {
                    currentText += item.str + " ";

                    if (currentText.length >= chunkSize) {
                        // Save the current chunk as a JSON file
                        const chunk: PdfChunk = {
                            page: pageNum,
                            chunkIndex,
                            text: currentText,
                            imageRefs: imageRefs,
                        };

                        const chunkFilename = `p_${pageNum}_c_${chunkIndex}.json`;
                        fs.writeFileSync(
                            path.join(pagesDir, chunkFilename),
                            JSON.stringify(chunk, null, 2),
                        );
                        chunkIndex++;
                        currentText = "";
                    }
                }
            }

            // If any leftover text remains, store it as the last chunk
            if (currentText.length > 0) {
                const chunk: PdfChunk = {
                    page: pageNum,
                    chunkIndex,
                    text: currentText,
                    imageRefs,
                };
                const chunkFilename = `p_${pageNum}_c_${chunkIndex}.json`;
                fs.writeFileSync(
                    path.join(pagesDir, chunkFilename),
                    JSON.stringify(chunk, null, 2),
                );
            }
        }
    } catch (error) {
        console.error("Error extracting text and images:", error);
    }
}
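
A minimal sketch of reading those chunk files back, assuming a paper folder name such as 2401.00001 (hypothetical) under the papers directory:

import fs from "fs";
import path from "path";

// Hypothetical folder produced by extractTextChunksFromPdf for one paper.
const pagesDir = path.join("papers", "2401.00001", "pages");

// Load every p_<page>_c_<chunk>.json file back into PdfChunk objects.
const chunks = fs
    .readdirSync(pagesDir)
    .filter((name) => name.endsWith(".json"))
    .map((name) =>
        JSON.parse(fs.readFileSync(path.join(pagesDir, name), "utf8")),
    );

console.log(`Loaded ${chunks.length} chunks; first is page ${chunks[0]?.page}`);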

export async function extractTextAndImages(
    pdfPath: string,
    chunkSize: number = 4096,
) {
    try {
        let outputDir = path.join(__dirname, "papers");
        const folderName = path.parse(pdfPath).name;
        outputDir = path.join(outputDir, folderName);

        // Ensure folder exists
        if (!fs.existsSync(outputDir))
            fs.mkdirSync(outputDir, { recursive: true });
        if (!fs.existsSync(path.join(outputDir, "images")))
            fs.mkdirSync(path.join(outputDir, "images"));

        const data = new Uint8Array(fs.readFileSync(pdfPath));
        const loadingTask = getDocument({ data });
        const pdfDocument = await loadingTask.promise;

        console.log("PDF loaded");
        let chunks: PdfChunk[] = [];
        let currentText = "";
        let chunkIndex = 0;
        // Process each page of the PDF
        for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
            const page = await pdfDocument.getPage(pageNum);
            const textContent = await page.getTextContent();
            const operatorList = await page.getOperatorList();
            const imageRefs: string[] = [];

            // Extract text
            for (const item of textContent.items) {
                if ("str" in item) {
                    currentText += item.str + " ";
                    if (currentText.length >= chunkSize) {
                        chunks.push({
                            text: currentText,
                            page: pageNum,
                            chunkIndex,
                            imageRefs,
                        });
                        currentText = "";
                        chunkIndex++;
                    }
                }
            }

            // Await image extraction so the PNG files exist before moving on,
            // and keep the filenames; the chunks above share this imageRefs
            // array by reference, so pushing here also updates them.
            imageRefs.push(
                ...(await extractImagesFromPage(
                    operatorList,
                    page,
                    pageNum,
                    outputDir,
                )),
            );
        }

        if (currentText.length > 0) {
            chunks.push({
                text: currentText,
                page: pdfDocument.numPages,
                chunkIndex,
                imageRefs: [],
            });
        }

        return chunks;
    } catch (error) {
        console.error("Error extracting text and images:", error);
        return [];
    }
}
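
A call-site sketch for this in-memory variant, assuming a PDF already on disk at a hypothetical path (top-level await, as main.ts already uses):

// Extract chunks without writing per-chunk JSON files.
const pdfChunks = await extractTextAndImages("papers/2401.00001.pdf");
console.log(`Extracted ${pdfChunks.length} text chunks`);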

export async function extractImagesFromPage(
    operatorList: any,
    page: any,
    pageNum: number,
    outputDir: string,
): Promise<string[]> {
    const imageRefs: string[] = [];
    const viewport = page.getViewport({ scale: 1.0 });
    // Both getViewport calls use scale 1.0, so scaleFactor is always 1 here;
    // it only matters if the two calls are ever given different scales.
    const scaleFactor = viewport.width / page.getViewport({ scale: 1.0 }).width;
    for (let i = 0; i < operatorList.fnArray.length; i++) {
        if (operatorList.fnArray[i] === OPS.paintImageXObject) {
            const imageName = operatorList.argsArray[i][0];

            try {
                const image = await new Promise<any>((resolve, reject) => {
                    page.objs.get(imageName, (img: any) => {
                        if (img) resolve(img);
                        else reject(new Error(`Image ${imageName} not ready`));
                    });
                });

                if (image) {
                    const { width, height, data } = image;

                    const scaledWidth = width * scaleFactor;
                    const scaledHeight = height * scaleFactor;

                    const canvas = createCanvas(scaledWidth, scaledHeight);
                    const ctx = canvas.getContext("2d");

                    const imageData = ctx.createImageData(
                        scaledWidth,
                        scaledHeight,
                    );
                    // Assumes `data` is already in RGBA layout; see the note
                    // below for images that pdf.js decodes as packed RGB.
                    imageData.data.set(new Uint8ClampedArray(data));
                    ctx.putImageData(imageData, 0, 0);

                    // Save as PNG
                    const imageFilename = `image_p${pageNum}_${i}.png`;
                    const imagePath = path.join(
                        outputDir,
                        "images",
                        imageFilename,
                    );
                    fs.writeFileSync(imagePath, canvas.toBuffer("image/png"));

                    imageRefs.push(imageFilename);
                }
            } catch (err: any) {
                console.warn(
                    `Skipping unresolved image ${imageName} on page ${pageNum}: ${err.message}`,
                );
            }
        }
    }

    return imageRefs;
}
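
One caveat: pdf.js can hand back packed 24-bit RGB pixel data for some images, while createImageData expects RGBA, so the straight copy above can skew colors or overrun the buffer. A hedged sketch of a conversion helper (hypothetical, not part of this commit):

// Expand packed RGB bytes (3 per pixel) to the RGBA layout ImageData expects.
function rgbToRgba(
    data: Uint8Array,
    width: number,
    height: number,
): Uint8ClampedArray {
    const out = new Uint8ClampedArray(width * height * 4);
    for (let src = 0, dst = 0; dst < out.length; src += 3, dst += 4) {
        out[dst] = data[src]; // R
        out[dst + 1] = data[src + 1]; // G
        out[dst + 2] = data[src + 2]; // B
        out[dst + 3] = 255; // A: fully opaque
    }
    return out;
}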

export async function extractImagesFromPageV1(
    operatorList: any,
    page: any,
    pageNum: number,
    outputDir: string,
): Promise<string[]> {
    const imageRefs: string[] = [];

    for (let i = 0; i < operatorList.fnArray.length; i++) {
        if (operatorList.fnArray[i] === OPS.paintImageXObject) {
            const imageName = operatorList.argsArray[i][0];

            try {
                const image = await new Promise<any>((resolve, reject) => {
                    page.objs.get(imageName, (img: any) => {
                        if (img) resolve(img);
                        else reject(new Error(`Image ${imageName} not ready`));
                    });
                });

                if (image) {
                    const { width, height, data } = image;
                    const canvas = createCanvas(width, height);
                    const ctx = canvas.getContext("2d");

                    const imageData = ctx.createImageData(width, height);
                    imageData.data.set(new Uint8ClampedArray(data));
                    ctx.putImageData(imageData, 0, 0);

                    const imageFilename = `image_p${pageNum}_${i}.png`;
                    const imagePath = path.join(
                        outputDir,
                        "images",
                        imageFilename,
                    );
                    fs.writeFileSync(imagePath, canvas.toBuffer("image/png"));

                    imageRefs.push(imageFilename);
                }
            } catch (err: any) {
                console.warn(
                    `Skipping unresolved image ${imageName} on page ${pageNum}: ${err.message}`,
                );
            }
        }
    }

    return imageRefs;
}
9 changes: 7 additions & 2 deletions ts/examples/docuProc/src/main.ts
@@ -5,6 +5,7 @@ import dotenv from "dotenv";
import {
    downloadArxivPaper,
    fetchArxivPapers,
    extractTextChunksFromPdf,
    printArxivPaperParsedData,
} from "./docuProc.js";

@@ -15,7 +16,7 @@ console.log("Lets start processing your documents ...");
const papers: any[] | undefined = await fetchArxivPapers({
    searchTerm: "transformer",
    searchField: "title",
-    maxResults: 3,
+    maxResults: 1,
});
if (papers !== undefined && papers.length > 0) {
    console.log(`Found ${papers.length} papers`);
@@ -25,7 +26,11 @@ if (papers !== undefined && papers.length > 0) {
    printArxivPaperParsedData(papers);
    papers.forEach(async (paper) => {
        try {
-            await downloadArxivPaper(paper);
+            const pdfFilePath: string | undefined =
+                await downloadArxivPaper(paper);
+            if (pdfFilePath !== undefined) {
+                await extractTextChunksFromPdf(pdfFilePath);
+            }
        } catch (error) {
            console.error("Error downloading paper:", error);
        }
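
One note on the loop above: Array.prototype.forEach does not await async callbacks, so all downloads start at once and nothing upstream can await their completion. A sequential alternative (a sketch, not what this commit does):

for (const paper of papers) {
    try {
        const pdfFilePath = await downloadArxivPaper(paper);
        if (pdfFilePath !== undefined) {
            await extractTextChunksFromPdf(pdfFilePath);
        }
    } catch (error) {
        console.error("Error downloading paper:", error);
    }
}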