algolia上传脚本
This commit is contained in:
446
scripts/algolia-upload.mjs
Normal file
446
scripts/algolia-upload.mjs
Normal file
@@ -0,0 +1,446 @@
|
||||
import { readFile, readdir } from 'node:fs/promises';
|
||||
import { resolve, join } from 'node:path';
|
||||
import matter from 'gray-matter';
|
||||
import { algoliasearch } from 'algoliasearch';
|
||||
import { fileURLToPath } from 'url';
|
||||
import path from 'path';
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
const docsDir = resolve(__dirname, '../docs');
|
||||
|
||||
// 加载环境变量
|
||||
await import('dotenv').then(r => r.config());
|
||||
|
||||
// 配置参数
|
||||
const ALGOLIA_APP_ID = process.env.ALGOLIA_APP_ID;
|
||||
const ALGOLIA_API_KEY = process.env.ALGOLIA_ADMIN_KEY;
|
||||
const INDEX_NAME = 'netease-modsdk';
|
||||
const MAX_RECORD_SIZE = 8000; // 字节,保守一点小于10000的限制
|
||||
|
||||
/**
|
||||
* 递归获取所有 Markdown 文件
|
||||
*/
|
||||
async function getAllMarkdownFiles(dir, base = '') {
|
||||
const files = [];
|
||||
const entries = await readdir(dir, { withFileTypes: true });
|
||||
|
||||
for (const entry of entries) {
|
||||
const fullPath = join(dir, entry.name);
|
||||
|
||||
if (entry.isDirectory()) {
|
||||
const subPath = join(base, entry.name);
|
||||
const subFiles = await getAllMarkdownFiles(fullPath, subPath);
|
||||
files.push(...subFiles);
|
||||
} else if (entry.name.endsWith('.md')) {
|
||||
// 忽略以 _ 开头的特殊文件
|
||||
if (entry.name.startsWith('_') || entry.name.includes('索引')) {
|
||||
console.log(entry.name);
|
||||
continue;
|
||||
}
|
||||
// 构建路由路径
|
||||
let routePath = join(base, entry.name.replace(/\.md$/, ''));
|
||||
// 特殊处理 index.md 文件,转换为目录路径
|
||||
if (entry.name === 'index.md') {
|
||||
routePath = base;
|
||||
}
|
||||
// 确保路径以 / 开头
|
||||
routePath = '/' + routePath;
|
||||
// 规范化路径分隔符
|
||||
routePath = routePath.replace(/\\/g, '/');
|
||||
|
||||
files.push({
|
||||
filePath: fullPath,
|
||||
path: routePath
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return files;
|
||||
}
|
||||
|
||||
import { marked } from 'marked';
|
||||
|
||||
/**
|
||||
* 使用marked解析器按照标题(H1和H2)分割Markdown内容
|
||||
* 跳过标题为"索引"的H1部分
|
||||
*/
|
||||
function splitByHeadings(content) {
|
||||
const tokens = marked.lexer(content);
|
||||
const sections = [];
|
||||
let currentH1 = null;
|
||||
let currentH2 = null;
|
||||
let currentSection = {
|
||||
h1Title: null,
|
||||
h2Title: null,
|
||||
content: '',
|
||||
anchor: '',
|
||||
level: 0
|
||||
};
|
||||
let sectionStartIndex = 0;
|
||||
let skipSection = false; // 标记是否跳过当前区块
|
||||
|
||||
// 初始处理:如果内容不是以标题开始,创建初始区块
|
||||
if (tokens.length > 0 && tokens[0].type !== 'heading') {
|
||||
let initialContent = '';
|
||||
let i = 0;
|
||||
|
||||
// 收集直到第一个标题之前的所有内容
|
||||
while (i < tokens.length && tokens[i].type !== 'heading') {
|
||||
if (tokens[i].type === 'paragraph') {
|
||||
initialContent += tokens[i].text + '\n\n';
|
||||
} else if (tokens[i].type === 'code') {
|
||||
initialContent += '```' + tokens[i].lang + '\n' + tokens[i].text + '\n```\n\n';
|
||||
} else {
|
||||
initialContent += tokens[i].raw + '\n';
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
if (initialContent.trim()) {
|
||||
sections.push({
|
||||
h1Title: null,
|
||||
h2Title: null,
|
||||
content: initialContent.trim(),
|
||||
anchor: '',
|
||||
level: 0
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// 主循环:处理所有标题并分割内容
|
||||
for (let i = 0; i < tokens.length; i++) {
|
||||
const token = tokens[i];
|
||||
|
||||
if (token.type === 'heading') {
|
||||
// 遇到新标题,处理之前的内容
|
||||
if (i > sectionStartIndex && !skipSection) {
|
||||
// 如果当前内容不为空且不在跳过模式,保存当前区块
|
||||
if (currentSection.content.trim()) {
|
||||
sections.push({ ...currentSection });
|
||||
}
|
||||
}
|
||||
|
||||
// 检查是否需要跳过此部分(标题为"索引")
|
||||
skipSection = token.text === "索引" || token.text.includes("索引");
|
||||
|
||||
// 根据标题级别更新当前上下文
|
||||
if (!skipSection && token.depth === 1) {
|
||||
currentH1 = token.text;
|
||||
currentH2 = null;
|
||||
|
||||
if (!skipSection) {
|
||||
currentSection = {
|
||||
h1Title: token.text,
|
||||
h2Title: null,
|
||||
content: token.raw + '\n',
|
||||
anchor: generateAnchor(token.text),
|
||||
level: 1
|
||||
};
|
||||
} else {
|
||||
console.log(`跳过索引部分: ${token.text}`);
|
||||
currentSection = {
|
||||
h1Title: null,
|
||||
h2Title: null,
|
||||
content: '',
|
||||
anchor: '',
|
||||
level: 0
|
||||
};
|
||||
}
|
||||
} else if (!skipSection && token.depth === 2) {
|
||||
currentH2 = token.text;
|
||||
currentSection = {
|
||||
h1Title: currentH1,
|
||||
h2Title: token.text,
|
||||
content: token.raw + '\n',
|
||||
anchor: generateAnchor(token.text),
|
||||
level: 2
|
||||
};
|
||||
}
|
||||
|
||||
sectionStartIndex = i;
|
||||
} else if (!skipSection) { // 只处理非跳过模式下的内容
|
||||
// 非标题内容,添加到当前区块
|
||||
if (token.type === 'paragraph') {
|
||||
currentSection.content += token.text + '\n\n';
|
||||
} else if (token.type === 'code') {
|
||||
currentSection.content += '```' + token.lang + '\n' + token.text + '\n```\n\n';
|
||||
} else {
|
||||
currentSection.content += token.raw + '\n';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 添加最后一个区块(如果不在跳过模式)
|
||||
if (currentSection.content.trim() && !skipSection) {
|
||||
sections.push(currentSection);
|
||||
}
|
||||
|
||||
return sections;
|
||||
}
|
||||
|
||||
/**
|
||||
* 生成VitePress兼容的锚点链接
|
||||
*/
|
||||
function generateAnchor(text) {
|
||||
return `#${text
|
||||
.toLowerCase()
|
||||
.replace(/\s+/g, '-')
|
||||
.replace(/[^\w\u4e00-\u9fa5-]/g, '')}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* 清理内容,移除代码块等
|
||||
*/
|
||||
function cleanContent(content) {
|
||||
// 移除代码块
|
||||
return content.replace(/```[\s\S]*?```/g, '');
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取目录的优先级
|
||||
* mcdocs - 最高优先级
|
||||
* mcguide - 中等优先级
|
||||
* mconline - 最低优先级
|
||||
*/
|
||||
function getDirectoryPriority(path) {
|
||||
const firstDir = path.split('/').filter(Boolean)[0];
|
||||
|
||||
switch (firstDir) {
|
||||
case 'mcdocs':
|
||||
return 3; // 最高优先级
|
||||
case 'mcguide':
|
||||
return 2; // 中等优先级
|
||||
case 'mconline':
|
||||
return 1; // 较低优先级
|
||||
default:
|
||||
return 0; // 默认优先级
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 生成符合VitePress的Algolia索引记录
|
||||
*/
|
||||
async function generateAlgoliaRecords() {
|
||||
// 获取所有 Markdown 文件
|
||||
const pages = await getAllMarkdownFiles(docsDir);
|
||||
const records = [];
|
||||
|
||||
for (const page of pages) {
|
||||
try {
|
||||
const rawContent = await readFile(page.filePath, 'utf8');
|
||||
const { data: frontmatter, content } = matter(rawContent);
|
||||
const pageTitle = frontmatter.title || page.path.split('/').pop() || page.path;
|
||||
|
||||
// 确定这个页面的优先级
|
||||
const priority = getDirectoryPriority(page.path);
|
||||
|
||||
// 按标题分割内容
|
||||
const sections = splitByHeadings(content);
|
||||
|
||||
for (const section of sections) {
|
||||
const { h1Title, h2Title, content: sectionContent, anchor, level } = section;
|
||||
const cleanedContent = cleanContent(sectionContent);
|
||||
|
||||
// 构建这个部分的层次结构
|
||||
const hierarchy = {
|
||||
lvl0: pageTitle,
|
||||
lvl1: h1Title || pageTitle,
|
||||
};
|
||||
|
||||
// 如果有H2标题,添加到层次结构
|
||||
if (h2Title) {
|
||||
hierarchy.lvl2 = h2Title;
|
||||
}
|
||||
|
||||
// 构建唯一ID和URL
|
||||
const objectID = `${page.path}${anchor}`;
|
||||
const url = `https://modsdk.easecation.net${page.path}${anchor}`;
|
||||
|
||||
// 创建记录
|
||||
const record = {
|
||||
objectID,
|
||||
url,
|
||||
type: level ? `lvl${level}` : 'content', // 标识这是什么级别的标题
|
||||
hierarchy,
|
||||
content: cleanedContent,
|
||||
_tags: ['zh-CN'],
|
||||
lang: "zh-CN",
|
||||
priority: priority // 添加优先级字段
|
||||
};
|
||||
|
||||
// 如果记录太大,则需要分割
|
||||
const recordSize = Buffer.byteLength(JSON.stringify(record), 'utf8');
|
||||
if (recordSize <= MAX_RECORD_SIZE) {
|
||||
records.push(record);
|
||||
} else {
|
||||
console.log(`部分内容过大,需要进一步分割: ${objectID}`);
|
||||
|
||||
// 按段落分割大内容
|
||||
const paragraphs = cleanedContent.split(/\n\s*\n/);
|
||||
let currentChunk = '';
|
||||
let chunkIndex = 0;
|
||||
|
||||
for (const paragraph of paragraphs) {
|
||||
if (Buffer.byteLength(currentChunk + paragraph, 'utf8') > MAX_RECORD_SIZE / 2) {
|
||||
// 当前块足够大,保存并开始新块
|
||||
if (currentChunk) {
|
||||
records.push({
|
||||
...record,
|
||||
objectID: `${objectID}-chunk-${chunkIndex}`,
|
||||
content: currentChunk,
|
||||
_tags: [...(record._tags || []), 'chunked'],
|
||||
priority: record.priority // 确保保留优先级
|
||||
});
|
||||
chunkIndex++;
|
||||
currentChunk = paragraph;
|
||||
} else {
|
||||
// 单个段落过大,需要硬分割
|
||||
currentChunk = paragraph.substring(0, 1000) + '...';
|
||||
records.push({
|
||||
...record,
|
||||
objectID: `${objectID}-chunk-${chunkIndex}`,
|
||||
content: currentChunk,
|
||||
_tags: [...(record._tags || []), 'chunked'],
|
||||
priority: record.priority // 确保保留优先级
|
||||
});
|
||||
chunkIndex++;
|
||||
currentChunk = '';
|
||||
}
|
||||
} else {
|
||||
currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
|
||||
}
|
||||
}
|
||||
|
||||
// 添加最后一个块
|
||||
if (currentChunk) {
|
||||
records.push({
|
||||
...record,
|
||||
objectID: `${objectID}-chunk-${chunkIndex}`,
|
||||
content: currentChunk,
|
||||
_tags: [...(record._tags || []), 'chunked'],
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`无法处理文件 ${page.filePath}: ${error.message}`);
|
||||
}
|
||||
}
|
||||
|
||||
return records;
|
||||
}
|
||||
|
||||
/**
|
||||
* 上传数据到 Algolia
|
||||
*/
|
||||
async function uploadToAlgolia(records) {
|
||||
if (!ALGOLIA_APP_ID || !ALGOLIA_API_KEY) {
|
||||
console.error('❌ 缺少 Algolia 凭据。请检查环境变量 ALGOLIA_APP_ID 和 ALGOLIA_ADMIN_KEY');
|
||||
return;
|
||||
}
|
||||
|
||||
const client = algoliasearch(ALGOLIA_APP_ID, ALGOLIA_API_KEY);
|
||||
|
||||
try {
|
||||
console.log(`开始上传 ${records.length} 条记录到 Algolia...`);
|
||||
|
||||
// 分批上传记录以避免请求过大
|
||||
const batchSize = 50; // 每批次处理的记录数
|
||||
let successCount = 0;
|
||||
let failureCount = 0;
|
||||
|
||||
for (let i = 0; i < records.length; i += batchSize) {
|
||||
const batch = records.slice(i, i + batchSize);
|
||||
try {
|
||||
const operations = batch.map(record => ({
|
||||
action: 'updateObject',
|
||||
indexName: INDEX_NAME,
|
||||
body: record
|
||||
}));
|
||||
|
||||
await client.multipleBatch({ requests: operations });
|
||||
successCount += batch.length;
|
||||
console.log(`✅ 批次 ${Math.floor(i / batchSize) + 1}/${Math.ceil(records.length / batchSize)} 已上传 (${successCount}/${records.length})`);
|
||||
} catch (batchError) {
|
||||
console.error(`❌ 批次 ${Math.floor(i / batchSize) + 1} 上传失败:`, batchError.message);
|
||||
failureCount += batch.length;
|
||||
|
||||
// 尝试逐条上传这个批次,以识别有问题的记录
|
||||
for (const record of batch) {
|
||||
try {
|
||||
await client.addOrUpdateObject({
|
||||
indexName: INDEX_NAME,
|
||||
objectID: record.objectID,
|
||||
body: record
|
||||
});
|
||||
successCount += 1;
|
||||
failureCount -= 1;
|
||||
} catch (recordError) {
|
||||
console.error(`❌ 记录上传失败 (objectID: ${record.objectID}):`, recordError.message);
|
||||
console.error(`记录大小: ${Buffer.byteLength(JSON.stringify(record), 'utf8')} 字节`);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`✅ 完成上传 - 成功: ${successCount}, 失败: ${failureCount}`);
|
||||
} catch (error) {
|
||||
console.error('❌ 上传失败:', error);
|
||||
if (error.message) console.error('错误信息:', error.message);
|
||||
if (error.status) console.error('状态码:', error.status);
|
||||
}
|
||||
}
|
||||
|
||||
async function settingAlgolia() {
|
||||
if (!ALGOLIA_APP_ID || !ALGOLIA_API_KEY) {
|
||||
console.error('❌ 缺少 Algolia 凭据。请检查环境变量 ALGOLIA_APP_ID 和 ALGOLIA_ADMIN_KEY');
|
||||
return;
|
||||
}
|
||||
|
||||
const client = algoliasearch(ALGOLIA_APP_ID, ALGOLIA_API_KEY);
|
||||
|
||||
try {
|
||||
const response = await client.setSettings({
|
||||
indexName: INDEX_NAME,
|
||||
indexSettings: {
|
||||
attributesForFaceting: ['lang', 'type'],
|
||||
attributesToHighlight: ['hierarchy.lvl0', 'hierarchy.lvl1', 'hierarchy.lvl2', 'content'],
|
||||
attributesToSnippet: ['content:20'],
|
||||
attributesToRetrieve: ['hierarchy', 'content', 'type', 'url', 'lang', 'priority'],
|
||||
searchableAttributes: [
|
||||
'hierarchy.lvl0',
|
||||
'hierarchy.lvl1',
|
||||
'hierarchy.lvl2',
|
||||
'content'
|
||||
],
|
||||
customRanking: [
|
||||
'desc(priority)', // 首先按优先级降序排列(值越高排越前)
|
||||
'asc(content.length)' // 其次按内容长度升序(简短内容优先)
|
||||
]
|
||||
},
|
||||
forwardToReplicas: true,
|
||||
});
|
||||
console.log('✅ 索引设置已更新');
|
||||
} catch (error) {
|
||||
console.error('❌ 设置失败:', error);
|
||||
}
|
||||
}
|
||||
|
||||
// 执行流程
|
||||
try {
|
||||
console.log('设置全局配置...');
|
||||
await settingAlgolia();
|
||||
|
||||
console.log('开始生成 Algolia 索引数据...');
|
||||
const records = await generateAlgoliaRecords();
|
||||
console.log(`生成了 ${records.length} 条记录`);
|
||||
|
||||
if (records.length > 0) {
|
||||
await uploadToAlgolia(records);
|
||||
} else {
|
||||
console.warn('⚠️ 没有找到文档,跳过上传');
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('❌ 处理失败:', error);
|
||||
}
|
||||
Reference in New Issue
Block a user