Files
blowfish/scripts/seed-firebase-views.js
2026-01-12 17:42:10 +00:00

323 lines
11 KiB
JavaScript

#!/usr/bin/env node
/**
* Import Google Analytics pageview data into Firestore
*
* Usage:
* 1. Export CSV from GA4: Reports → Engagement → Pages and screens → Download CSV
* 2. Get Firebase service account key from Firebase Console → Project Settings → Service accounts
* 3. Run: node seed-firebase-views.js <path-to-csv>
*
* The script maps GA page paths to Blowfish document IDs:
* /docs/configuration/ → views_docs-configuration-index.md
*/
import { initializeApp, cert } from 'firebase-admin/app';
import { getFirestore } from 'firebase-admin/firestore';
import { createReadStream, existsSync, readFileSync } from 'fs';
import { parse } from 'csv-parse';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
// Configuration
const SERVICE_ACCOUNT_PATH = join(__dirname, 'serviceAccountKey.json');
const COLLECTION_NAME = 'views';
const DRY_RUN = process.argv.includes('--dry-run');
const FORCE = process.argv.includes('--force'); // Overwrite existing documents
// Check for required files
if (!existsSync(SERVICE_ACCOUNT_PATH)) {
console.error('Error: serviceAccountKey.json not found in scripts directory');
console.error('Download it from Firebase Console → Project Settings → Service accounts');
process.exit(1);
}
const csvPath = process.argv.find(arg => !arg.startsWith('--') && !arg.includes('node') && !arg.includes('seed-firebase-views'));
if (!csvPath) {
console.error('Usage: node seed-firebase-views.js <path-to-csv> [options]');
console.error('');
console.error('Options:');
console.error(' --dry-run Preview what would be imported without writing to Firestore');
console.error(' --force Overwrite existing documents (useful for re-importing)');
process.exit(1);
}
if (!existsSync(csvPath)) {
console.error(`Error: CSV file not found: ${csvPath}`);
process.exit(1);
}
// Initialize Firebase Admin
const serviceAccount = JSON.parse(readFileSync(SERVICE_ACCOUNT_PATH, 'utf8'));
initializeApp({
credential: cert(serviceAccount)
});
const db = getFirestore();
// Language prefixes used in Hugo multilingual setup
// These get stripped from URLs to normalize to the base path
const LANGUAGE_PREFIXES = ['en', 'fr', 'zh-cn', 'es', 'de', 'it', 'ja', 'pt-pt', 'pt-br', 'bg', 'bn', 'fa', 'he', 'hu', 'id', 'pl', 'ro', 'ru', 'th', 'tr', 'uk', 'vi', 'zh-tw'];
/**
* Strip language prefix from a URL path
* /pt-pt/docs/configuration/ → /docs/configuration/
* /docs/configuration/ → /docs/configuration/ (unchanged)
*/
function stripLanguagePrefix(pagePath) {
for (const lang of LANGUAGE_PREFIXES) {
const prefix = `/${lang}/`;
if (pagePath.startsWith(prefix)) {
return '/' + pagePath.slice(prefix.length);
}
// Also handle case where lang is at root like /en or /pt-pt
if (pagePath === `/${lang}` || pagePath === `/${lang}/`) {
return '/';
}
}
return pagePath;
}
/**
* Convert a GA page path to a Blowfish Firestore document ID
*
* Blowfish uses .File.Path which includes "index.md" for page bundles:
* GA path: /docs/configuration/
* Hugo file: docs/configuration/index.md
* Firestore ID: views_docs-configuration-index.md
*/
function pathToDocId(pagePath) {
// First strip any language prefix
pagePath = stripLanguagePrefix(pagePath);
// Remove leading/trailing slashes
let cleanPath = pagePath.replace(/^\/|\/$/g, '');
// Handle special cases
if (cleanPath === '' || cleanPath === '/') {
// Homepage: _index.md
return 'views__index.md';
}
// For section pages like /docs/, the file is docs/_index.md
// For article pages like /docs/configuration/, the file is docs/configuration/index.md
// We can tell them apart: section pages don't have a second path segment after the section
const parts = cleanPath.split('/');
if (parts.length === 1) {
// Section page like "docs" → docs/_index.md
cleanPath = `${cleanPath}-_index.md`;
} else {
// Article page like "docs/configuration" → docs/configuration/index.md → docs-configuration-index.md
cleanPath = cleanPath.replace(/\//g, '-') + '-index.md';
}
// Replace any remaining slashes with hyphens
cleanPath = cleanPath.replace(/\//g, '-');
return `views_${cleanPath}`;
}
/**
* Parse the GA4 CSV export
* GA4 exports have metadata lines starting with # before the actual CSV data
* Aggregates views from different language URLs that map to the same document
*/
async function parseGACSV(filePath) {
// Use a Map to aggregate views by document ID
const viewsByDocId = new Map();
// Track original paths for debugging
const pathsByDocId = new Map();
return new Promise((resolve, reject) => {
createReadStream(filePath)
.pipe(parse({
columns: true,
skip_empty_lines: true,
trim: true,
relax_column_count: true,
comment: '#' // Skip lines starting with #
}))
.on('data', (row) => {
// GA4 uses various column names depending on the report
const pagePath = row['Page path'] ||
row['Page path and screen class'] ||
row['Page'] ||
row['Landing page'];
const views = row['Views'] ||
row['Pageviews'] ||
row['Sessions'] ||
row['Screen views'];
if (pagePath && views) {
// Clean up the path and parse views
const cleanPath = pagePath.trim();
const viewCount = parseInt(views.replace(/,/g, ''), 10);
// Only include valid paths (skip query strings, fragments, etc.)
if (cleanPath.startsWith('/') && !isNaN(viewCount) && viewCount > 0) {
// Strip language prefix to get normalized path
const normalizedPath = stripLanguagePrefix(cleanPath);
// Check if this is a page we want to import
// Include docs, samples, and other sections relevant to blowfish.page
const validPrefixes = ['/docs/', '/samples/', '/users/', '/contributors/', '/tags/', '/authors/'];
const isValidPath = normalizedPath === '/' ||
validPrefixes.some(p => normalizedPath.startsWith(p)) ||
validPrefixes.some(p => normalizedPath === p.slice(0, -1)); // /docs without trailing slash
if (isValidPath) {
const docId = pathToDocId(cleanPath);
// Take the MAX views (not sum) - handles duplicate URLs like with/without trailing slash
const existingViews = viewsByDocId.get(docId) || 0;
viewsByDocId.set(docId, Math.max(existingViews, viewCount));
// Track paths for debugging
const existingPaths = pathsByDocId.get(docId) || [];
existingPaths.push({ path: cleanPath, views: viewCount });
pathsByDocId.set(docId, existingPaths);
}
}
}
})
.on('end', () => {
// Convert Map to array
const results = [];
for (const [docId, totalViews] of viewsByDocId) {
const paths = pathsByDocId.get(docId);
results.push({
docId,
views: totalViews,
sourcePaths: paths // For debugging - shows which URLs were aggregated
});
}
// Sort by views descending
results.sort((a, b) => b.views - a.views);
resolve(results);
})
.on('error', reject);
});
}
/**
* Import data to Firestore
*/
async function importToFirestore(data) {
console.log(`\nImporting ${data.length} documents to Firestore...`);
if (FORCE) {
console.log('FORCE mode: will overwrite existing documents\n');
}
// Use batched writes for efficiency (max 500 per batch)
const batchSize = 500;
let imported = 0;
let updated = 0;
let skipped = 0;
for (let i = 0; i < data.length; i += batchSize) {
const batch = db.batch();
const chunk = data.slice(i, i + batchSize);
for (const item of chunk) {
const docRef = db.collection(COLLECTION_NAME).doc(item.docId);
// Check if document already exists
const existing = await docRef.get();
if (existing.exists) {
if (FORCE) {
batch.set(docRef, { views: item.views });
updated++;
} else {
console.log(` Skipping ${item.docId} (already exists with ${existing.data().views} views)`);
skipped++;
continue;
}
} else {
batch.set(docRef, { views: item.views });
imported++;
}
}
if (!DRY_RUN) {
await batch.commit();
}
console.log(` Processed ${Math.min(i + batchSize, data.length)}/${data.length}`);
}
return { imported, updated, skipped };
}
// Main execution
async function main() {
console.log('Google Analytics to Firestore Import');
console.log('====================================');
if (DRY_RUN) {
console.log('DRY RUN MODE - No changes will be made');
}
if (FORCE) {
console.log('FORCE MODE - Will overwrite existing documents');
}
console.log('');
console.log(`Reading CSV: ${csvPath}`);
const data = await parseGACSV(csvPath);
if (data.length === 0) {
console.error('\nNo valid data found in CSV. Check the file format.');
console.error('Expected columns: "Page path" (or similar) and "Views" (or similar)');
process.exit(1);
}
console.log(`\nFound ${data.length} unique pages to import:`);
console.log('');
// Show preview with aggregation info
const preview = data.slice(0, 15);
for (const item of preview) {
console.log(` ${item.docId}`);
console.log(` Views: ${item.views.toLocaleString()}`);
if (item.sourcePaths.length > 1) {
console.log(` (max from ${item.sourcePaths.length} URL variants):`);
for (const sp of item.sourcePaths.slice(0, 3)) {
console.log(` - ${sp.path}: ${sp.views.toLocaleString()}`);
}
if (item.sourcePaths.length > 3) {
console.log(` ... and ${item.sourcePaths.length - 3} more`);
}
}
}
if (data.length > 15) {
console.log(` ... and ${data.length - 15} more`);
}
// Show total views to verify parsing
const totalViews = data.reduce((sum, item) => sum + item.views, 0);
console.log(`\nTotal views across all pages: ${totalViews.toLocaleString()}`)
if (DRY_RUN) {
console.log('\n✓ Dry run complete. Run without --dry-run to import.');
return;
}
const { imported, updated, skipped } = await importToFirestore(data);
console.log('\n====================================');
console.log(`✓ Import complete!`);
console.log(` New documents: ${imported}`);
if (updated > 0) {
console.log(` Updated (--force): ${updated}`);
}
if (skipped > 0) {
console.log(` Skipped (already exist): ${skipped}`);
}
}
main().catch((error) => {
console.error('Error:', error.message);
process.exit(1);
});