1. Refactor scylla model; 2. support graceful shutdown
zensh committed Jul 19, 2023
1 parent 1dff3c6 commit f60960f
Showing 19 changed files with 974 additions and 1,311 deletions.
5 changes: 5 additions & 0 deletions .eslintignore
@@ -0,0 +1,5 @@
src/**/*.test.ts
node_modules
coverage
dist
debug
29 changes: 29 additions & 0 deletions .eslintrc.cjs
@@ -0,0 +1,29 @@
/* eslint-env node */
module.exports = {
env: {
browser: true,
es2021: true
},
extends: [
"plugin:@typescript-eslint/recommended-type-checked",
"plugin:@typescript-eslint/stylistic-type-checked"
],
plugins: ["@typescript-eslint"],
parser: "@typescript-eslint/parser",
parserOptions: {
project: ["./tsconfig.eslint.json"],
ecmaVersion: "latest",
sourceType: "module"
},
rules: {
// Note: you must disable the base rule as it can report incorrect errors
"space-before-function-paren": "off",
"@typescript-eslint/space-before-function-paren": "off",
"@typescript-eslint/no-explicit-any": "off",
"@typescript-eslint/no-unsafe-assignment": "off",
"@typescript-eslint/no-unsafe-return": "off",
"@typescript-eslint/no-unsafe-member-access": "off",
"@typescript-eslint/no-unsafe-argument": "off",
},
root: true,
}
16 changes: 0 additions & 16 deletions .eslintrc.json

This file was deleted.

1 change: 1 addition & 0 deletions config/default.json
@@ -1,5 +1,6 @@
{
"port": 8080,
"gracefulShutdown": 5,
"proxyUrls": [],
"scylladb": {
"username": "",
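The new gracefulShutdown setting is presumably the drain window in seconds for the commit's graceful-shutdown support. A minimal sketch of how it could be wired, assuming the service's Koa app and the config package; none of this wiring is shown in the diff:

import { createServer } from 'node:http'
import config from 'config'
import Koa from 'koa'

const app = new Koa()
const server = createServer(app.callback())
server.listen(config.get<number>('port'))

process.on('SIGTERM', () => {
  // Stop accepting new connections; exit once in-flight requests drain.
  server.close(() => process.exit(0))
  // Hard deadline: force exit if draining exceeds the configured window.
  const seconds = config.get<number>('gracefulShutdown')
  setTimeout(() => process.exit(1), seconds * 1000).unref()
})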
33 changes: 12 additions & 21 deletions cql/schema.cql
@@ -1,26 +1,17 @@
-- hid BLOB, # SHA1(unique Url)
-- at BIGINT, # create at, a timestamp (date and time) with second precision.
-- url TEXT, # unique Url, generated by crawlee
-- src TEXT, # source Url
-- title TEXT, # document title
-- meta MAP<text, text>, # document meta info. e.g. author, keywords, description, etc.
-- cbor BLOB, # document content in CBOR format
-- html TEXT, # document content in html format
-- page TEXT, # full page content in html format
CREATE TABLE IF NOT EXISTS doc (
hid BLOB,
at BIGINT,
url TEXT,
src TEXT,
title TEXT,
meta MAP<text, text>,
cbor BLOB,
html TEXT,
page TEXT,
PRIMARY KEY (hid, at)
) WITH CLUSTERING ORDER BY (at DESC)
AND caching = {'enabled': 'true'}
id BLOB, -- document id, 12 bytes XID, https://github.com/yiwen-ai/xid-ts
url TEXT, -- unique Url, generated by crawlee
src TEXT, -- source Url
title TEXT, -- document title
meta MAP<text, text>, -- document meta info. e.g. author, keywords, description, etc.
content BLOB, -- document content in CBOR format
html TEXT, -- document content in html format
page TEXT, -- full page content in html format
PRIMARY KEY (id)
) WITH caching = {'enabled': 'true'}
AND comment = 'scraped documents from web'
AND compaction = {'class': 'SizeTieredCompactionStrategy'}
AND compression = {'sstable_compression': 'LZ4Compressor'}
AND default_time_to_live = 0;

CREATE INDEX doc_url ON doc (url);
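With the partition key now the 12-byte XID id, URL lookups go through the doc_url secondary index instead of the old (hid, at) key. A sketch of both access paths using the cassandra-driver package; contact point, data center, and keyspace are placeholders, and the project's actual driver setup may differ:

import { Client } from 'cassandra-driver'

async function lookupDoc(id: Buffer, url: string): Promise<void> {
  const client = new Client({
    contactPoints: ['127.0.0.1'],    // placeholder
    localDataCenter: 'datacenter1',  // placeholder
    keyspace: 'scraper'              // placeholder
  })
  await client.connect()

  // Point lookup by primary key; id is the 12-byte XID blob.
  const byId = await client.execute(
    'SELECT url, src, title, meta, content FROM doc WHERE id = ?',
    [id], { prepare: true })

  // Lookup by URL through the doc_url secondary index.
  const byUrl = await client.execute(
    'SELECT id, title FROM doc WHERE url = ?',
    [url], { prepare: true })

  console.log(byId.rowLength, byUrl.rowLength)
  await client.shutdown()
}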
66 changes: 27 additions & 39 deletions dist/api.js
@@ -1,53 +1,50 @@
import { format } from 'node:util';
import { URL } from 'node:url';
import { Xid } from 'xid-ts';
import { LogLevel, createLog, logError, writeLog } from './log.js';
import { scraping } from './crawler.js';
import { parseHTMLDocument } from './tiptap.js';
import { Document } from './db/model.js';
import { DocumentModel } from './db/model.js';
const serverStartAt = Date.now();
export async function versionAPI(ctx) {
export function versionAPI(ctx) {
ctx.body = {
result: {
name: 'webscraper'
}
};
}
export async function healthzAPI(ctx) {
const s = ctx.app.context.db.getState();
export function healthzAPI(ctx) {
const db = ctx.app.context.db;
const s = db.getState();
ctx.body = {
result: {
start: serverStartAt,
hosts: s._hosts.length,
openConnections: s._openConnections,
inFlightQueries: s._inFlightQueries
scylla: s.toString(),
}
};
}
export async function scrapingAPI(ctx) {
const { db } = ctx.app.context;
const db = ctx.app.context.db;
const { url } = ctx.request.query;
if (!isValidUrl(url)) {
ctx.throw(400, format('Invalid scraping URL: %s', url));
}
const doc = await Document.findLatest(db, url);
const doc = await DocumentModel.findLatest(db, url);
if (doc.isFresh) {
// a fresh document is a document that has been scraped within the last 3600 seconds
ctx.body = {
readyAfter: 0,
result: {
id: doc.id.toString('base64url'),
url: doc.row.url
}
retry: 0,
result: doc.toJSON()
};
return;
}
const acquired = await doc.acquire(db);
if (!acquired) {
// fail to get the document scraping lock, it's being scraped by another process
ctx.body = {
readyAfter: 3,
retry: 1,
result: {
id: doc.id.toString('base64url'),
id: doc.row.id,
url: doc.row.url
}
};
@@ -62,15 +59,15 @@ export async function scrapingAPI(ctx) {
doc.setTitle(d.title);
doc.setMeta(d.meta);
doc.setPage(d.page);
doc.setCBOR(res.json);
doc.setContent(res.json);
doc.setHTML(res.html);
await doc.save(db);
log.url = d.url;
log.title = d.title;
log.meta = d.meta;
log.pageLength = d.page.length;
log.htmlLength = res.html.length;
log.cborLength = doc.row.cbor?.length;
log.cborLength = doc.row.content?.length;
log.elapsed = Date.now() - log.start;
writeLog(log);
}).catch(async (err) => {
@@ -80,43 +77,34 @@
logError(err);
});
ctx.body = {
readyAfter: 2,
retry: 2,
result: {
id: doc.id.toString('base64url'),
id: doc.row.id,
url: doc.row.url
}
};
}
export async function documentAPI(ctx) {
const { db } = ctx.app.context;
const { id, url, output } = ctx.request.query;
let doc;
if (typeof id === 'string' && id !== '') {
const idBuf = Buffer.from(id, 'base64url');
if (idBuf.length === 28) {
doc = Document.fromId(idBuf);
}
}
else if (isValidUrl(url)) {
doc = await Document.findLatest(db, url);
const { id, output } = ctx.request.query;
let xid = null;
try {
xid = Xid.fromValue(id);
}
if (doc == null) {
ctx.throw(400, format('invalid document id %s or url %s', id, url));
catch {
ctx.throw(404, format('invalid document id %s', id));
}
let selectColumns = ['url', 'src', 'title', 'meta', 'meta', 'cbor'];
const doc = new DocumentModel(xid);
let selectColumns = ['url', 'src', 'title', 'meta', 'content'];
if (output === 'basic') { // 'basic', 'detail', 'full'
selectColumns = ['url', 'src', 'title', 'meta'];
}
else if (output === 'full') {
selectColumns = ['url', 'src', 'title', 'meta', 'cbor', 'html', 'page'];
selectColumns = ['url', 'src', 'title', 'meta', 'content', 'html', 'page'];
}
await doc.fill(db, selectColumns);
ctx.body = {
result: {
id: doc.id.toString('base64url'),
url: doc.row.url,
doc: doc.toJSON()
}
result: doc.row
};
}
function isValidUrl(url) {
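The response shape changed from readyAfter to a retry hint: 0 means the document is fresh and included in result, non-zero means scraping is still in progress and the caller should poll again. A hedged client sketch against the new /v1/scraping route; the base URL and back-off delay are assumptions:

async function scrape(baseUrl: string, target: string): Promise<unknown> {
  for (;;) {
    const res = await fetch(`${baseUrl}/v1/scraping?url=${encodeURIComponent(target)}`)
    if (!res.ok) throw new Error(`scraping failed: ${res.status}`)
    const body = await res.json() as { retry: number, result: unknown }
    if (body.retry === 0) return body.result // fresh document
    // Non-zero retry: another process holds the lock or scraping just started.
    await new Promise((r) => setTimeout(r, 2000))
  }
}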
17 changes: 8 additions & 9 deletions dist/app.js
@@ -13,8 +13,8 @@ export async function initApp(app) {
router.use(initContext);
router.get('/', versionAPI);
router.get('/healthz', healthzAPI);
router.get('/scraping', scrapingAPI);
router.get('/document', documentAPI);
router.get('/v1/scraping', scrapingAPI);
router.get('/v1/document', documentAPI);
app.use(router.routes());
app.use(router.allowedMethods());
}
@@ -49,7 +49,7 @@ async function initContext(ctx, next) {
writeLog(errLog);
}
log.msg = err.message;
ctx.status = err.status == null ? 500 : err.status;
ctx.status = err.status ?? 500;
ctx.body = {
error: {
code: err.code,
@@ -61,11 +61,11 @@
finally {
// log when the response is finished or closed, whichever happens first.
const { res } = ctx;
const onfinish = done.bind(null, 'finish');
const onclose = done.bind(null, 'close');
const onfinish = done.bind(null);
const onclose = done.bind(null);
res.once('finish', onfinish);
res.once('close', onclose);
function done(_event) {
function done() {
res.removeListener('finish', onfinish);
res.removeListener('close', onclose);
log.length = ctx.length;
@@ -79,13 +79,12 @@
if (body != null && typeof body === 'object') {
if (acceptCBOR) {
body = encode(body);
// console.log(body.toString('hex'))
ctx.set('content-length', body.length);
ctx.set('content-length', String(body.length));
ctx.set('content-type', 'application/cbor');
}
else {
body = Buffer.from(JSON.stringify(body), 'utf8');
ctx.set('content-length', body.length);
ctx.set('content-length', String(body.length));
ctx.set('content-type', 'application/json');
}
if (body.length > GZIP_MIN_LENGTH && ctx.acceptsEncodings('gzip') === 'gzip') {
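initContext serializes the body as CBOR or JSON depending on the request's Accept header, sets content-length as a string, and gzips large responses. A client-side sketch of the CBOR path, assuming the cborg package for decoding; the server's own CBOR library isn't visible in this hunk:

import { decode } from 'cborg'

async function getDocument(id: string): Promise<unknown> {
  // Ask for CBOR; Node's fetch decompresses gzip transparently.
  const res = await fetch(`http://127.0.0.1:8080/v1/document?id=${id}`, {
    headers: { accept: 'application/cbor' }
  })
  if (res.headers.get('content-type') === 'application/cbor') {
    return decode(new Uint8Array(await res.arrayBuffer()))
  }
  return res.json() // server fell back to JSON
}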
4 changes: 2 additions & 2 deletions dist/crawler.js
@@ -8,7 +8,7 @@ export async function scraping(url) {
reject = _reject;
});
const crawler = new CheerioCrawler({
requestHandler: async ({ request, $ }) => {
requestHandler: ({ request, $ }) => {
let articleTitle = $('h1');
if (articleTitle.length === 0) {
articleTitle = $('h2');
@@ -52,7 +52,7 @@
doc.html = articleContent.html();
resolve(doc);
},
failedRequestHandler: async ({ request }) => {
failedRequestHandler: ({ request }) => {
const msg = request.errorMessages.map((str) => {
let i = str.indexOf('\n');
if (i === -1)
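scraping() wraps a single CheerioCrawler run in a promise: requestHandler resolves it with the parsed document, failedRequestHandler rejects it with the collected error messages. A minimal usage sketch; the exact shape of the resolved doc beyond title and html is assumed:

import { scraping } from './crawler.js'

// Resolves once requestHandler calls resolve(doc); rejects on failure.
const doc = await scraping('https://example.com/article')
console.log(doc.title, doc.html?.length)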