Skip to content

Commit

Permalink
Merge branch 'rootedbox-upgrade_doc_indexing' into preview
Browse files Browse the repository at this point in the history
  • Loading branch information
sestinj committed Mar 8, 2024
2 parents 09c24b4 + 34e8429 commit fbf55b7
Show file tree
Hide file tree
Showing 13 changed files with 1,780 additions and 961 deletions.
146 changes: 0 additions & 146 deletions core/indexing/chunk/markdown.ts

This file was deleted.

131 changes: 131 additions & 0 deletions core/indexing/docs/article.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import { MAX_CHUNK_SIZE } from "../../llm/constants";
import { Chunk } from "../..";

/**
 * One section of an article, as produced by `extractTitlesAndBodies`:
 * an `h2` heading together with the text that follows it.
 */
type ArticleComponent = {
// Text content of the section's `h2` heading ('' when the heading has none).
title: string;
// Text content of the elements following the heading, up to the next `h2`.
body: string;
};

/**
 * A fetched documentation page broken into sections, as returned by
 * `urlToArticle` and consumed by `chunkArticle`.
 */
type Article = {
// The subpath that was passed to `urlToArticle`; reused as each chunk's
// `filepath` and `digest`.
subpath: string;
// Page title as reported by Readability.
title: string;
// The page's sections in document order (snake_case kept: it is part of the shape callers build).
article_components: ArticleComponent[];
};

/**
 * Splits one article section into line-aligned chunks.
 *
 * Lines of `article.body` are accumulated greedily; a chunk is closed as soon
 * as appending the next line would push the accumulated text past
 * `MAX_CHUNK_SIZE` (compared against string length). Every line is stored
 * with a trailing '\n'.
 *
 * A single line that is longer than `MAX_CHUNK_SIZE` is not split: it becomes
 * a chunk of its own that exceeds the limit.
 *
 * @param article - Section whose body is to be chunked; its title is copied
 *   into each chunk's `otherMetadata.title`.
 * @param subpath - Used as both `filepath` and `digest` of every chunk.
 * @returns Chunks in document order, with `index` counting from 0 and
 *   `startLine`/`endLine` being inclusive 0-based line numbers within the body.
 *   No chunk has empty `content`.
 */
function breakdownArticleComponent(article: ArticleComponent, subpath: string): Chunk[] {
  const chunks: Chunk[] = [];

  let content = '';
  let startLine = 0;
  let endLine = 0;

  // Closes the chunk being accumulated. An empty accumulator is skipped so an
  // over-long first line can never produce a chunk with `content: ''`.
  const flush = (): void => {
    if (content.length === 0) {
      return;
    }
    chunks.push({
      content,
      startLine,
      endLine,
      otherMetadata: {
        title: article.title,
      },
      // Chunks are numbered in the order they are emitted.
      index: chunks.length,
      filepath: subpath,
      digest: subpath,
    });
  };

  for (const [lineNumber, line] of article.body.split('\n').entries()) {
    // The appended '\n' is deliberately not counted, matching the size check
    // this function has always used.
    const overflows = content.length + line.length > MAX_CHUNK_SIZE;
    if (overflows && content.length > 0) {
      flush();
      content = '';
      startLine = lineNumber;
    }
    content += line + '\n';
    endLine = lineNumber;
  }

  // Emit whatever is left after the last line.
  flush();

  return chunks;
}

/**
 * Chunks every section of an article and returns the results as one flat
 * list, preserving section order and the chunk order within each section.
 *
 * NOTE(review): each section is chunked independently by
 * `breakdownArticleComponent`, so chunk `index` values restart at 0 for every
 * section while all chunks share the same `filepath` — confirm consumers do
 * not expect indices to be unique per subpath.
 *
 * @param articleResult - Article whose `article_components` are chunked; its
 *   `subpath` is forwarded to every chunk.
 * @returns All chunks of the article, or an empty array if it has no sections.
 */
export function chunkArticle(articleResult: Article): Chunk[] {
  const { subpath, article_components: sections } = articleResult;

  return sections.flatMap((section) =>
    breakdownArticleComponent(section, subpath),
  );
}

/**
 * Splits an HTML fragment into sections keyed by its `h2` headings.
 *
 * For every `h2` in the document, the section body is the text content of the
 * element siblings that follow it, up to (not including) the next `h2`
 * sibling. The sibling texts are joined with '\n' so that adjacent elements
 * do not fuse into one run of text ("One.Two.") and so the body keeps line
 * boundaries; siblings with no text are skipped.
 *
 * Only `h2` elements start a section: anything that precedes the first `h2`,
 * and any document without an `h2`, contributes no component.
 *
 * @param html - HTML markup to parse.
 * @returns One component per `h2`, in document order.
 */
function extractTitlesAndBodies(html: string): ArticleComponent[] {
  const document = new JSDOM(html).window.document;
  const headings = Array.from(document.querySelectorAll('h2'));

  return headings.map((heading) => {
    const parts: string[] = [];

    // Walk forward through element siblings until the next section starts.
    for (
      let sibling = heading.nextElementSibling;
      sibling && sibling.tagName !== 'H2';
      sibling = sibling.nextElementSibling
    ) {
      const text = sibling.textContent ?? '';
      if (text) {
        parts.push(text);
      }
    }

    return { title: heading.textContent ?? '', body: parts.join('\n') };
  });
}

/**
 * Fetches a documentation page and converts it into an `Article`.
 *
 * The page at `subpath` (resolved against `baseUrl`) is downloaded, reduced
 * to its main content with Readability, and split into sections by
 * `extractTitlesAndBodies`.
 *
 * This function never rejects: every failure is reported as `undefined`, so
 * callers can run many conversions under `Promise.all` and simply skip the
 * missing results.
 *
 * @param subpath - Path of the page; echoed back as `Article.subpath`.
 * @param baseUrl - Base URL the subpath is resolved against.
 * @returns The parsed article, or `undefined` when the URL is malformed, the
 *   response status is not OK, Readability cannot parse the page, or any step
 *   throws (the error is logged in that case).
 */
export async function urlToArticle(
  subpath: string,
  baseUrl: URL,
): Promise<Article | undefined> {
  try {
    // Built inside the try block: `new URL` throws on a malformed subpath, and
    // that must surface as `undefined` rather than as a rejected promise.
    const url = new URL(subpath, baseUrl);
    const response = await fetch(url.toString());

    if (!response.ok) {
      return undefined;
    }

    const htmlContent = await response.text();
    const dom = new JSDOM(htmlContent);
    const article = new Readability(dom.window.document).parse();

    if (!article) {
      return undefined;
    }

    const article_components = extractTitlesAndBodies(article.content);
    return {
      subpath,
      title: article.title,
      article_components,
    };
  } catch (err) {
    console.error("Error converting URL to article components", err);
    return undefined;
  }
}
53 changes: 13 additions & 40 deletions core/indexing/docs/index.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import {
Chunk,
ChunkWithoutID,
EmbeddingsProvider,
IndexingProgressUpdate,
} from "../..";
import { MAX_CHUNK_SIZE } from "../../llm/constants";
import { markdownChunker } from "../chunk/markdown";

import { crawlSubpages } from "./crawl";
import { addDocs, listDocs } from "./db";
import { convertURLToMarkdown } from "./urlToMarkdown";
import { urlToArticle, chunkArticle } from "./article";

export async function* indexDocs(
title: string,
Expand All @@ -31,6 +29,7 @@ export async function* indexDocs(

const subpathGenerator = crawlSubpages(baseUrl);
let { value, done } = await subpathGenerator.next();

while (true) {
if (done) {
break;
Expand All @@ -49,52 +48,26 @@ export async function* indexDocs(
const chunks: Chunk[] = [];
const embeddings: number[][] = [];

let markdownForSubpaths = await Promise.all(
subpaths.map((subpath) => convertURLToMarkdown(new URL(subpath, baseUrl))),
let articles = await Promise.all(
subpaths.map(subpath => urlToArticle(subpath, baseUrl)),
);

// Filter out undefineds
let filteredSubpaths: string[] = [];
let filteredMarkdown: string[] = [];
for (let i = 0; i < subpaths.length; i++) {
if (markdownForSubpaths[i]) {
filteredSubpaths.push(subpaths[i]);
filteredMarkdown.push(markdownForSubpaths[i]!);
}
}
subpaths = filteredSubpaths;
markdownForSubpaths = filteredMarkdown;
for (const article of articles) {
if (!article) continue;

for (let i = 0; i < subpaths.length; i++) {
const subpath = subpaths[i];
yield {
progress: Math.max(1, Math.floor(100 / (subpaths.length + 1))),
desc: `${subpath}`,
desc: `${article.subpath}`,
};

const markdown = markdownForSubpaths[i]!;
const markdownChunks: ChunkWithoutID[] = [];
for await (const chunk of markdownChunker(markdown, MAX_CHUNK_SIZE, 0)) {
markdownChunks.push(chunk);
}

const subpathEmbeddings = await embeddingsProvider.embed(
markdownChunks.map((chunk) => chunk.content),
chunkArticle(article).map(chunk => {
chunks.push(chunk);

return chunk.content;
})
);

markdownChunks.forEach((chunk, index) => {
chunks.push({
...chunk,
filepath:
subpath +
(chunk.otherMetadata?.fragment
? `#${chunk.otherMetadata.fragment}`
: ""),
otherMetadata: chunk.otherMetadata,
index,
digest: subpath,
});
});
embeddings.push(...subpathEmbeddings);
}

Expand Down

0 comments on commit fbf55b7

Please sign in to comment.