WebsiteTemplate/analytics/verify.php
2026-01-25 11:33:37 -04:00

302 lines
9.1 KiB
PHP

#!/usr/bin/env php
<?php
/**
* Analytics Verification Script
*
* This script helps verify the accuracy of analytics data by:
* - Comparing raw visit counts with summary totals
* - Identifying potential issues (bots, duplicates, etc.)
* - Validating data integrity
* - Showing discrepancies
*/
// Timezone: every date() call in this script is evaluated in this zone.
date_default_timezone_set('UTC'); // Adjust to your server's actual timezone
$dataDir = '/var/www/data/analytics';
$date = isset($argv[1]) ? $argv[1] : date('Y-m-d');

// The argument is spliced straight into file paths below, so accept nothing
// but a YYYY-MM-DD value (no path separators, no "..").
if (!is_string($date) || !preg_match('/^\d{4}-\d{2}-\d{2}$/', $date)) {
    echo "ERROR: Invalid date argument (expected YYYY-MM-DD)\n";
    exit(1);
}

echo "=== Analytics Verification for {$date} ===\n\n";

/**
 * Read one daily JSON file and return its decoded array.
 *
 * Prints an "ERROR: ..." line and exits with status 1 when the file is
 * missing, unreadable, not valid JSON, or does not decode to an array.
 * A corrupt file must not be mistaken for "no data", because every check
 * further down would then report misleading mismatches.
 *
 * @param string $path  Absolute path of the JSON file.
 * @param string $label Human-readable name used in error messages.
 * @return array Decoded contents (associative arrays, never objects).
 */
$loadJsonFile = function (string $path, string $label): array {
    if (!file_exists($path)) {
        echo "ERROR: {$label} file not found: {$path}\n";
        exit(1);
    }

    $raw = file_get_contents($path);
    if ($raw === false) {
        echo "ERROR: {$label} file could not be read: {$path}\n";
        exit(1);
    }

    $decoded = json_decode($raw, true);
    if ($decoded === null && json_last_error() !== JSON_ERROR_NONE) {
        echo "ERROR: {$label} file contains invalid JSON ({$path}): " . json_last_error_msg() . "\n";
        exit(1);
    }
    if (!is_array($decoded)) {
        echo "ERROR: {$label} file does not contain a JSON object or array: {$path}\n";
        exit(1);
    }

    return $decoded;
};

// Load summary
$summaryFile = $dataDir . '/summary_' . $date . '.json';
$summary = $loadJsonFile($summaryFile, 'Summary');

// Load raw visits
$visitsFile = $dataDir . '/visits_' . $date . '.json';
$visits = $loadJsonFile($visitsFile, 'Visits');
// 1. Count pageviews from raw data
// Records without a 'type' (or that are not arrays at all) are simply not
// pageviews. array_filter preserves the original keys of $visits.
$pageviews = array_filter($visits, function ($v) {
    return is_array($v) && ($v['type'] ?? null) === 'pageview';
});
$pageviewCount = count($pageviews);

// 2. Count new vs returning from raw data
// Visitor IDs are used as array keys so each visitor is counted once per
// bucket. A visitor with both new and non-new pageviews lands in both buckets.
$newVisitors = [];
$returningVisitors = [];
foreach ($pageviews as $visit) {
    // A missing ID falls back to '' (the same key PHP would use for null).
    $visitorId = $visit['visitorId'] ?? '';
    // !empty() is the same truthiness test as before, but tolerates a missing flag.
    if (!empty($visit['isNew'])) {
        $newVisitors[$visitorId] = true;
    } else {
        $returningVisitors[$visitorId] = true;
    }
}
$newCount = count($newVisitors);
$returningCount = count($returningVisitors);

// 3. Recalculate hourly distribution
// NOTE(review): date() treats the timestamp as Unix seconds; confirm the
// tracker does not store milliseconds.
$byHour = array_fill(0, 24, 0);
foreach ($pageviews as $visit) {
    if (isset($visit['timestamp']) && is_numeric($visit['timestamp'])) {
        // date('H') always yields "00".."23", so no range check is needed.
        $hour = (int) date('H', (int) $visit['timestamp']);
        $byHour[$hour]++;
    }
}
// 4. Count shares / 5. Count RSS clicks
// Both tallies come from one pass over every raw event. Only the three
// platforms pre-seeded in $shares are counted; any other platform is ignored.
$shares = ['mastodon' => 0, 'bluesky' => 0, 'copy' => 0];
$rssClicks = 0;
foreach ($visits as $visit) {
    // Tolerate malformed records: no 'type' means the event is neither kind.
    $type = is_array($visit) ? ($visit['type'] ?? null) : null;

    if ($type === 'rss_click') {
        $rssClicks++;
    } elseif ($type === 'share') {
        $platform = $visit['platform'] ?? null;
        // is_string() keeps arrays/objects from being used as an array offset.
        if (is_string($platform) && isset($shares[$platform])) {
            $shares[$platform]++;
        }
    }
}
// 6. Identify potential issues
$issues = [];

// Check for duplicate pageviews (same visitor, same page, within 5 seconds).
// Pageviews are first bucketed by (visitor, page); only timestamps inside one
// bucket are compared, instead of comparing every pageview with every other.
$duplicates = [];
$pageviewGroups = [];
foreach ($pageviews as $visit) {
    if (!isset($visit['timestamp']) || !is_numeric($visit['timestamp'])) {
        continue; // no usable time, so nothing to compare
    }
    $visitorId = $visit['visitorId'] ?? null;
    $page = $visit['page'] ?? null;
    // serialize() keeps the type of each part, so "1" and 1 stay distinct,
    // matching a strict (===) comparison of visitor and page.
    $groupKey = serialize([$visitorId, $page]);
    if (!isset($pageviewGroups[$groupKey])) {
        $pageviewGroups[$groupKey] = ['visitor' => $visitorId, 'page' => $page, 'times' => []];
    }
    $pageviewGroups[$groupKey]['times'][] = $visit['timestamp'];
}
foreach ($pageviewGroups as $group) {
    $times = $group['times'];
    sort($times, SORT_NUMERIC);
    $total = count($times);
    for ($i = 0; $i < $total; $i++) {
        // Times are ascending, so stop at the first partner 5s or more away.
        // Every pair closer than 5s is still recorded exactly once.
        for ($j = $i + 1; $j < $total && ($times[$j] - $times[$i]) < 5; $j++) {
            $duplicates[] = [
                'visitor' => $group['visitor'],
                'page' => $group['page'],
                'time1' => date('H:i:s', (int) $times[$i]),
                'time2' => date('H:i:s', (int) $times[$j])
            ];
        }
    }
}
if (count($duplicates) > 0) {
    $issues[] = "Found " . count($duplicates) . " potential duplicate pageviews (same visitor, same page, <5s apart)";
}
// Check for suspicious user agents (common bots)
// Each event counts at most once, however many patterns its user agent matches.
$botPatterns = [
    '/bot/i', '/crawler/i', '/spider/i', '/scraper/i',
    '/google/i', '/bing/i', '/yahoo/i', '/duckduckbot/i',
    '/facebookexternalhit/i', '/twitterbot/i', '/linkedinbot/i'
];
$botCount = 0;
$botVisitors = [];
foreach ($visits as $visit) {
    $ua = is_array($visit) ? ($visit['userAgent'] ?? '') : '';
    // preg_match() needs a string subject; an empty one cannot match any pattern.
    if (!is_string($ua) || $ua === '') {
        continue;
    }
    foreach ($botPatterns as $pattern) {
        if (preg_match($pattern, $ua)) {
            $botCount++;
            // Keyed by visitor ID so count($botVisitors) is the number of unique visitors.
            $botVisitors[$visit['visitorId'] ?? ''] = true;
            break;
        }
    }
}
if ($botCount > 0) {
    $issues[] = "Found {$botCount} visits from potential bots/crawlers (" . count($botVisitors) . " unique visitors)";
}
// Check for rapid-fire visits (potential scripted access)
// $rapidVisits maps visitor ID => number of consecutive pageview pairs that
// were less than 2 seconds apart.
$rapidVisits = [];
$visitorTimestamps = [];
foreach ($pageviews as $visit) {
    // A pageview without a usable timestamp would otherwise act like time 0
    // and look "rapid" next to any other such record.
    if (!isset($visit['timestamp']) || !is_numeric($visit['timestamp'])) {
        continue;
    }
    $visitorTimestamps[$visit['visitorId'] ?? ''][] = $visit['timestamp'];
}
foreach ($visitorTimestamps as $vid => $timestamps) {
    sort($timestamps, SORT_NUMERIC);
    $total = count($timestamps);
    for ($i = 1; $i < $total; $i++) {
        $diff = $timestamps[$i] - $timestamps[$i - 1];
        if ($diff < 2) { // Less than 2 seconds between pageviews
            $rapidVisits[$vid] = ($rapidVisits[$vid] ?? 0) + 1;
        }
    }
}
if (count($rapidVisits) > 0) {
    $issues[] = "Found " . count($rapidVisits) . " visitors with rapid-fire pageviews (<2s apart)";
}
// Check summary vs raw data discrepancies
// A key missing from the summary is treated as 0 everywhere in this section,
// so an incomplete summary is reported as a mismatch rather than a PHP warning.
// Comparisons stay loose (!=) on purpose: decoded JSON numbers may arrive as
// int, float or numeric string.
$discrepancies = [];

// Headline counters: label => [value stored in summary, value recounted from raw data]
$counterChecks = [
    'Total visits' => [$summary['total'] ?? 0, $pageviewCount],
    'New visitors' => [$summary['new'] ?? 0, $newCount],
    'Returning visitors' => [$summary['returning'] ?? 0, $returningCount],
];
foreach ($counterChecks as $label => $pair) {
    list($fromSummary, $fromRaw) = $pair;
    if ($fromSummary != $fromRaw) {
        $discrepancies[] = sprintf(
            "%s mismatch: Summary=%d, Raw count=%d (diff: %+d)",
            $label, $fromSummary, $fromRaw, $fromSummary - $fromRaw
        );
    }
}

// Compare hourly data (only whether any hour differs is reported, not which one)
$hourlyDiff = false;
for ($h = 0; $h < 24; $h++) {
    if (($summary['byHour'][$h] ?? 0) != $byHour[$h]) {
        $hourlyDiff = true;
        break;
    }
}
if ($hourlyDiff) {
    $discrepancies[] = "Hourly distribution differs between summary and raw data";
}

// Compare shares
foreach (['mastodon', 'bluesky', 'copy'] as $platform) {
    $summaryShares = $summary['shares'][$platform] ?? 0;
    $rawShares = $shares[$platform] ?? 0;
    if ($summaryShares != $rawShares) {
        // The platform is passed as an argument, not interpolated into the format string.
        $discrepancies[] = sprintf(
            "Shares (%s) mismatch: Summary=%d, Raw count=%d (diff: %+d)",
            $platform, $summaryShares, $rawShares, $summaryShares - $rawShares
        );
    }
}

// Compare RSS
$summaryRss = $summary['rss'] ?? 0;
if ($summaryRss != $rssClicks) {
    $discrepancies[] = sprintf(
        "RSS clicks mismatch: Summary=%d, Raw count=%d (diff: %+d)",
        $summaryRss, $rssClicks, $summaryRss - $rssClicks
    );
}
// Display results

// Figures as stored in the summary file (a missing key prints as 0).
echo "SUMMARY DATA:\n",
    " Total visits: ", ($summary['total'] ?? 0), "\n",
    " New visitors: ", ($summary['new'] ?? 0), "\n",
    " Returning visitors: ", ($summary['returning'] ?? 0), "\n",
    " RSS clicks: ", ($summary['rss'] ?? 0), "\n",
    " Shares: Mastodon=", ($summary['shares']['mastodon'] ?? 0),
    ", Bluesky=", ($summary['shares']['bluesky'] ?? 0),
    ", Copy=", ($summary['shares']['copy'] ?? 0), "\n\n";

// Figures recounted from the raw event list.
echo "RAW DATA COUNT:\n",
    " Total pageviews: {$pageviewCount}\n",
    " Unique new visitors: {$newCount}\n",
    " Unique returning visitors: {$returningCount}\n",
    " RSS clicks: {$rssClicks}\n",
    " Shares: Mastodon={$shares['mastodon']}, Bluesky={$shares['bluesky']}, Copy={$shares['copy']}\n",
    " Total visits (all types): ", count($visits), "\n\n";

// Prints either the all-clear line, or the heading followed by one bullet per
// finding and a blank line.
$printFindings = function (array $findings, $heading, $allClear) {
    if (count($findings) === 0) {
        echo $allClear;
        return;
    }
    echo $heading;
    foreach ($findings as $finding) {
        echo " - {$finding}\n";
    }
    echo "\n";
};

$printFindings($discrepancies, "⚠️ DISCREPANCIES FOUND:\n", "✓ Summary and raw data match!\n\n");
$printFindings($issues, "⚠️ POTENTIAL ISSUES:\n", "✓ No obvious issues detected.\n\n");
// Show top visitors
// Lists the ten visitors with the most pageviews, together with the user
// agent of the first pageview seen for each of them.
echo "TOP VISITORS (by pageview count):\n";
$visitorCounts = [];
$firstUserAgents = [];
foreach ($pageviews as $visit) {
    $vid = $visit['visitorId'] ?? '';
    $visitorCounts[$vid] = ($visitorCounts[$vid] ?? 0) + 1;

    // Remember the user agent while counting. Looking it up afterwards with
    // `=== $vid` fails for numeric IDs, because PHP turns numeric-string
    // array keys into integers.
    if (!isset($firstUserAgents[$vid])) {
        $agent = $visit['userAgent'] ?? 'Unknown';
        $firstUserAgents[$vid] = is_string($agent) ? $agent : 'Unknown';
    }
}
arsort($visitorCounts);
$topVisitors = array_slice($visitorCounts, 0, 10, true);
foreach ($topVisitors as $vid => $count) {
    // Truncated for display; the "..." suffix is printed unconditionally.
    $ua = substr($firstUserAgents[$vid], 0, 50);
    echo sprintf(" %s: %d pageviews (UA: %s...)\n", substr((string) $vid, 0, 30), $count, $ua);
}
echo "\n";
// Show hourly breakdown
// One row per hour with a bar scaled so the busiest hour is 50 blocks wide.
echo "HOURLY BREAKDOWN (from raw data):\n";
// The scale is the same for every row, so compute it once. max(1, ...) avoids
// dividing by zero on a day without pageviews.
$peak = max(1, max($byHour));
for ($h = 0; $h < 24; $h++) {
    $count = $byHour[$h];
    $bar = str_repeat('█', min(50, (int)($count / $peak * 50)));
    echo sprintf(" %02d:00 %5d %s\n", $h, $count, $bar);
}
echo "\n";
// Accuracy notes
// Fixed list of caveats printed at the end of every report, followed by a blank line.
$accuracyNotes = [
    " ✓ Data is recalculated from raw timestamps (hourly stats are accurate)\n",
    " ⚠ Bot traffic is NOT filtered (may inflate numbers)\n",
    " ⚠ Ad blockers may prevent tracking (may deflate numbers)\n",
    " ⚠ Self-visits are NOT filtered\n",
    " ⚠ JavaScript-disabled browsers won't be tracked\n",
    " ⚠ Privacy tools may block localStorage (affects visitor ID)\n",
    " ⚠ New/Returning is calculated per-day, not across days\n",
    " ⚠ Multiple tabs/devices = multiple visitors\n",
];
echo "ACCURACY CONSIDERATIONS:\n", implode('', $accuracyNotes), "\n";
?>