Skip to content

Commit

Permalink
Update converting api
Browse files Browse the repository at this point in the history
  • Loading branch information
zensh committed Aug 12, 2023
1 parent 7fbdef8 commit 4a685d3
Show file tree
Hide file tree
Showing 8 changed files with 602 additions and 508 deletions.
19 changes: 17 additions & 2 deletions dist/api.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import { format } from 'node:util';
import { URL } from 'node:url';
import { Xid } from 'xid-ts';
import { encode } from 'cborg';
import contentType from 'content-type';
import getRawBody from 'raw-body';
import createError from 'http-errors';
import { LogLevel, createLog, logError, writeLog } from './log.js';
import { scraping } from './crawler.js';
import { parseHTML, toHTML } from './tiptap.js';
import { parseHTML, toHTML, findTitle } from './tiptap.js';
import { getConverter } from './converting.js';
import { DocumentModel } from './db/model.js';
const serverStartAt = Date.now();
Expand Down Expand Up @@ -135,8 +136,22 @@ export async function convertingAPI(ctx) {
const converter = getConverter(ct.type);
const buf = await getRawBody(ctx.req, { limit: '500kb' });
try {
const doc = await converter(buf);
const content = await converter(buf);
// console.log(Buffer.from(doc).toString('hex'))
let title = findTitle(content, 1);
if (title === '') {
title = findTitle(content, 2);
}
const doc = {
id: new Xid(),
url: "",
src: "",
title: title,
meta: {},
content: Buffer.from(encode(content)),
html: "",
page: ""
};
ctx.body = {
result: doc
};
Expand Down
24 changes: 16 additions & 8 deletions dist/converting.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import { encode } from 'cborg';
import createError from 'http-errors';
import { marked } from 'marked';
import pdfjs from 'pdfjs-dist';
Expand All @@ -24,12 +23,12 @@ export function getConverter(mime) {
/**
 * Converts an HTML payload: decodes `buf` as UTF-8 and passes the string to
 * parseHTML.
 *
 * NOTE(review): this span carries two consecutive return statements; as
 * written, only the first one is reachable. Presumably the first is the line
 * removed by the commit and the second is its replacement -- confirm against
 * the committed file.
 *
 * @param buf - request body; must provide toString('utf8') (presumably a
 *   Buffer -- verify against the caller).
 * @returns a resolved promise (see the note above for which value it carries).
 */
function convertHtml(buf) {
const html = buf.toString('utf8');
const doc = parseHTML(html);
return Promise.resolve(Buffer.from(encode(doc)));
return Promise.resolve(doc);
}
/**
 * Converts a Markdown payload: decodes `buf` as UTF-8, renders it with
 * marked.parse, and passes the rendered output to parseHTML.
 *
 * NOTE(review): this span carries two consecutive return statements; as
 * written, only the first one is reachable. Presumably the first is the line
 * removed by the commit and the second is its replacement -- confirm against
 * the committed file.
 *
 * @param buf - request body; must provide toString('utf8') (presumably a
 *   Buffer -- verify against the caller).
 * @returns a resolved promise (see the note above for which value it carries).
 */
function convertMarkdown(buf) {
const html = marked.parse(buf.toString('utf8'));
const doc = parseHTML(html);
return Promise.resolve(Buffer.from(encode(doc)));
return Promise.resolve(doc);
}
async function convertPdf(buf) {
const doc = await pdfjs.getDocument(new Uint8Array(buf)).promise;
Expand All @@ -49,6 +48,7 @@ async function convertPdf(buf) {
hl.finalize();
let texts = [];
let height = 0;
let prevNode = null;
for (let item of content.items) {
item = item;
if (item.str == null) {
Expand All @@ -67,16 +67,23 @@ async function convertPdf(buf) {
if (item.hasEOL) {
const level = hl.level(height);
if (level == 0) {
node.content.push({
prevNode = {
type: 'paragraph',
content: [{
type: 'text',
text: texts.join('')
}]
};
node.content.push(prevNode);
}
else if (prevNode != null && prevNode.type === 'heading' && prevNode.attrs.level === level) {
prevNode.content.push({
type: 'text',
text: texts.join('')
});
}
else {
node.content.push({
prevNode = {
type: "heading",
attrs: {
id: null,
Expand All @@ -86,7 +93,8 @@ async function convertPdf(buf) {
type: 'text',
text: texts.join('')
}]
});
};
node.content.push(prevNode);
}
texts = [];
height = 0;
Expand All @@ -104,7 +112,7 @@ async function convertPdf(buf) {
page.cleanup();
}
const amender = new JSONDocumentAmender();
return Promise.resolve(Buffer.from(encode(amender.amendNode(node))));
return Promise.resolve(amender.amendNode(node));
}
function convertText(buf) {
const texts = buf.toString('utf8').split(/\r\n|\r|\n/);
Expand All @@ -125,7 +133,7 @@ function convertText(buf) {
});
}
const amender = new JSONDocumentAmender();
return Promise.resolve(Buffer.from(encode(amender.amendNode(node))));
return Promise.resolve(amender.amendNode(node));
}
export class HeadingLevel {
sample;
Expand Down
22 changes: 22 additions & 0 deletions dist/tiptap.js
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,28 @@ export function parseHTML(html) {
/**
 * Renders a document through generateHTML using the module-level
 * tiptapExtensions list.
 *
 * @param doc - document node handed to generateHTML unchanged.
 * @returns whatever generateHTML(doc, tiptapExtensions) returns.
 */
export function toHTML(doc) {
  const rendered = generateHTML(doc, tiptapExtensions);
  return rendered;
}
/**
 * Finds the text of the first heading of the given level in a document tree.
 *
 * The tree is walked depth-first in document order. Headings are never
 * descended into: a heading of the requested level contributes the text of
 * its direct `text` children, and any other heading is ignored. A matching
 * heading that yields no text returns '' so that the walk in the parent
 * moves on to the next sibling.
 *
 * Malformed input (a null/undefined node or child, or a heading without
 * `attrs`) is treated as "no title here" instead of raising a TypeError.
 *
 * @param {object} doc - node with `type` and, optionally, `attrs` and `content`.
 * @param {number} level - heading level to look for (compared with ===).
 * @returns {string} the heading's text segments joined by a single space,
 *   or '' when no heading of that level carries text.
 */
export function findTitle(doc, level) {
  if (doc == null) {
    return '';
  }

  if (doc.type === 'heading') {
    // `attrs` may be absent on hand-built nodes; a missing level never matches.
    if (doc.attrs?.level !== level || doc.content == null) {
      return '';
    }
    const texts = [];
    for (const child of doc.content) {
      if (child?.type === 'text') {
        texts.push(child.text);
      }
    }
    // Segments are joined with a space, so a heading stored as several text
    // nodes reads as one line.
    return texts.join(' ');
  }

  if (doc.content != null) {
    for (const child of doc.content) {
      const title = findTitle(child, level);
      if (title !== '') {
        return title;
      }
    }
  }
  return '';
}
const LOCALHOST = 'https://localhost';
function isSameOriginHref(href) {
if (typeof href === 'string') {
Expand Down
84 changes: 42 additions & 42 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "webscraper",
"version": "0.5.1",
"version": "0.5.3",
"description": "",
"private": true,
"main": "dist/main.js",
Expand Down Expand Up @@ -34,40 +34,40 @@
"@tiptap-pro/extension-emoji": "2.1.0",
"@tiptap-pro/extension-mathematics": "2.1.0",
"@tiptap-pro/extension-unique-id": "2.1.0",
"@tiptap/core": "2.1.0-rc.12",
"@tiptap/extension-blockquote": "2.1.0-rc.12",
"@tiptap/extension-bold": "2.1.0-rc.12",
"@tiptap/extension-code": "2.1.0-rc.12",
"@tiptap/extension-code-block": "2.1.0-rc.12",
"@tiptap/extension-color": "2.1.0-rc.12",
"@tiptap/extension-document": "2.1.0-rc.12",
"@tiptap/extension-font-family": "2.1.0-rc.12",
"@tiptap/extension-hard-break": "2.1.0-rc.12",
"@tiptap/extension-heading": "2.1.0-rc.12",
"@tiptap/extension-horizontal-rule": "2.1.0-rc.12",
"@tiptap/extension-image": "2.1.0-rc.12",
"@tiptap/extension-italic": "2.1.0-rc.12",
"@tiptap/extension-link": "2.1.0-rc.12",
"@tiptap/extension-list-item": "2.1.0-rc.12",
"@tiptap/extension-mention": "2.1.0-rc.12",
"@tiptap/extension-ordered-list": "2.1.0-rc.12",
"@tiptap/extension-paragraph": "2.1.0-rc.12",
"@tiptap/extension-subscript": "2.1.0-rc.12",
"@tiptap/extension-superscript": "2.1.0-rc.12",
"@tiptap/extension-table": "2.1.0-rc.12",
"@tiptap/extension-table-cell": "2.1.0-rc.12",
"@tiptap/extension-table-header": "2.1.0-rc.12",
"@tiptap/extension-table-row": "2.1.0-rc.12",
"@tiptap/extension-task-item": "2.1.0-rc.12",
"@tiptap/extension-task-list": "2.1.0-rc.12",
"@tiptap/extension-text": "2.1.0-rc.12",
"@tiptap/extension-text-align": "2.1.0-rc.12",
"@tiptap/extension-text-style": "2.1.0-rc.12",
"@tiptap/extension-typography": "2.1.0-rc.12",
"@tiptap/extension-underline": "2.1.0-rc.12",
"@tiptap/extension-youtube": "2.1.0-rc.12",
"@tiptap/html": "2.1.0-rc.12",
"@tiptap/pm": "2.1.0-rc.12",
"@tiptap/core": "2.1.0-rc.14",
"@tiptap/extension-blockquote": "2.1.0-rc.14",
"@tiptap/extension-bold": "2.1.0-rc.14",
"@tiptap/extension-code": "2.1.0-rc.14",
"@tiptap/extension-code-block": "2.1.0-rc.14",
"@tiptap/extension-color": "2.1.0-rc.14",
"@tiptap/extension-document": "2.1.0-rc.14",
"@tiptap/extension-font-family": "2.1.0-rc.14",
"@tiptap/extension-hard-break": "2.1.0-rc.14",
"@tiptap/extension-heading": "2.1.0-rc.14",
"@tiptap/extension-horizontal-rule": "2.1.0-rc.14",
"@tiptap/extension-image": "2.1.0-rc.14",
"@tiptap/extension-italic": "2.1.0-rc.14",
"@tiptap/extension-link": "2.1.0-rc.14",
"@tiptap/extension-list-item": "2.1.0-rc.14",
"@tiptap/extension-mention": "2.1.0-rc.14",
"@tiptap/extension-ordered-list": "2.1.0-rc.14",
"@tiptap/extension-paragraph": "2.1.0-rc.14",
"@tiptap/extension-subscript": "2.1.0-rc.14",
"@tiptap/extension-superscript": "2.1.0-rc.14",
"@tiptap/extension-table": "2.1.0-rc.14",
"@tiptap/extension-table-cell": "2.1.0-rc.14",
"@tiptap/extension-table-header": "2.1.0-rc.14",
"@tiptap/extension-table-row": "2.1.0-rc.14",
"@tiptap/extension-task-item": "2.1.0-rc.14",
"@tiptap/extension-task-list": "2.1.0-rc.14",
"@tiptap/extension-text": "2.1.0-rc.14",
"@tiptap/extension-text-align": "2.1.0-rc.14",
"@tiptap/extension-text-style": "2.1.0-rc.14",
"@tiptap/extension-typography": "2.1.0-rc.14",
"@tiptap/extension-underline": "2.1.0-rc.14",
"@tiptap/extension-youtube": "2.1.0-rc.14",
"@tiptap/html": "2.1.0-rc.14",
"@tiptap/pm": "2.1.0-rc.14",
"cassandra-driver": "^4.6.4",
"cborg": "^2.0.3",
"cheerio": "1.0.0-rc.12",
Expand All @@ -77,15 +77,15 @@
"http-errors": "^2.0.0",
"koa": "^2.14.2",
"long": "^5.2.3",
"marked": "^7.0.0",
"marked": "^7.0.2",
"nanoid": "^4.0.2",
"pdfjs-dist": "^3.9.179",
"playwright": "^1.36.2",
"playwright": "^1.37.0",
"prosemirror-model": "^1.19.3",
"raw-body": "^2.5.2",
"uuid": "^9.0.0",
"xid-ts": "^1.0.1",
"zeed-dom": "^0.10.6"
"zeed-dom": "^0.10.7"
},
"devDependencies": {
"@types/config": "^3.3.0",
Expand All @@ -94,11 +94,11 @@
"@types/katex": "^0.16.2",
"@types/koa": "^2.13.8",
"@types/koa__router": "^12.0.0",
"@types/node": "^18.17.3",
"@types/node": "^18.17.5",
"@types/uuid": "^9.0.2",
"@typescript-eslint/eslint-plugin": "^6.2.1",
"@typescript-eslint/parser": "^6.2.1",
"eslint": "^8.46.0",
"@typescript-eslint/eslint-plugin": "^6.3.0",
"@typescript-eslint/parser": "^6.3.0",
"eslint": "^8.47.0",
"node-fetch": "^3.3.2",
"tsup": "^7.2.0",
"typescript": "^5.1.6"
Expand Down
Loading

0 comments on commit 4a685d3

Please sign in to comment.