302 lines
9.1 KiB
PHP
302 lines
9.1 KiB
PHP
#!/usr/bin/env php
<?php
/**
 * Analytics Verification Script
 *
 * This script helps verify the accuracy of analytics data by:
 * - Comparing raw visit counts with summary totals
 * - Identifying potential issues (bots, duplicates, etc.)
 * - Validating data integrity
 * - Showing discrepancies
 *
 * Takes one optional argument: the date to verify, as YYYY-MM-DD.
 * Without it, today's date is used.
 */

// Every date() call in this script (hour buckets, HH:MM:SS output) is rendered
// in this timezone. Adjust to your server's actual timezone.
date_default_timezone_set('UTC');

$dataDir = '/var/www/data/analytics';
$date = isset($argv[1]) ? $argv[1] : date('Y-m-d');

// The date becomes part of the data file names, so accept only a real calendar
// date in YYYY-MM-DD form. The round-trip comparison rejects typos, unpadded
// values, rolled-over dates such as 2024-02-30, and path fragments like "../".
$parsedDate = DateTimeImmutable::createFromFormat('!Y-m-d', (string) $date);
if ($parsedDate === false || $parsedDate->format('Y-m-d') !== $date) {
    echo "ERROR: Invalid date '{$date}' (expected YYYY-MM-DD)\n";
    exit(1);
}

echo "=== Analytics Verification for {$date} ===\n\n";

/**
 * Read one analytics JSON file and return its decoded contents as an array.
 *
 * An empty (or whitespace-only) file is treated as an empty dataset. Any other
 * problem prints an "ERROR: ..." line and ends the script with exit code 1,
 * so a corrupt file is never silently verified as if it were empty.
 *
 * @param string $path  Full path of the JSON file.
 * @param string $label Human-readable name used in error messages ("Summary", "Visits").
 *
 * @return array Decoded JSON (objects become associative arrays).
 */
function loadAnalyticsJson(string $path, string $label): array
{
    if (!file_exists($path)) {
        echo "ERROR: {$label} file not found: {$path}\n";
        exit(1);
    }

    // Check readability first so file_get_contents() does not emit a warning.
    $raw = is_readable($path) ? file_get_contents($path) : false;
    if ($raw === false) {
        echo "ERROR: {$label} file could not be read: {$path}\n";
        exit(1);
    }

    if (trim($raw) === '') {
        return [];
    }

    $decoded = json_decode($raw, true);
    if (!is_array($decoded)) {
        // Covers both syntax errors and valid JSON that is a bare scalar/null.
        echo "ERROR: {$label} file does not contain a JSON array/object ("
            . json_last_error_msg() . "): {$path}\n";
        exit(1);
    }

    return $decoded;
}

// Load summary
$summaryFile = $dataDir . '/summary_' . $date . '.json';
$summary = loadAnalyticsJson($summaryFile, 'Summary');

// Load raw visits
$visitsFile = $dataDir . '/visits_' . $date . '.json';
$visits = loadAnalyticsJson($visitsFile, 'Visits');

// 1. Count pageviews from raw data
// Keep only the 'pageview' entries, preserving their original keys in $visits.
$pageviews = [];
foreach ($visits as $key => $entry) {
    if ($entry['type'] === 'pageview') {
        $pageviews[$key] = $entry;
    }
}
$pageviewCount = count($pageviews);

// 2. Count new vs returning from raw data
// Visitor IDs are used as array keys, so each visitor is counted once per
// bucket no matter how many pageviews they produced.
$newVisitors = [];
$returningVisitors = [];
foreach ($pageviews as $pv) {
    if (!$pv['isNew']) {
        $returningVisitors[$pv['visitorId']] = true;
        continue;
    }
    $newVisitors[$pv['visitorId']] = true;
}
$newCount = count($newVisitors);
$returningCount = count($returningVisitors);

// 3. Recalculate hourly distribution
// One counter per hour of the day (0..23); pageviews without a timestamp are
// left out of the distribution.
$byHour = array_fill(0, 24, 0);
foreach ($pageviews as $pv) {
    if (!isset($pv['timestamp'])) {
        continue;
    }
    // 'G' is the 24-hour clock without a leading zero, so the result is
    // always an integer in 0..23 and needs no further range check.
    $byHour[(int) date('G', $pv['timestamp'])]++;
}

// 4. Count shares
// Only the three platforms listed here are tallied; 'share' entries naming any
// other platform (or none) are ignored.
$shares = ['mastodon' => 0, 'bluesky' => 0, 'copy' => 0];
foreach ($visits as $entry) {
    if ($entry['type'] !== 'share' || !isset($entry['platform'])) {
        continue;
    }
    $name = $entry['platform'];
    if (isset($shares[$name])) {
        $shares[$name] += 1;
    }
}

// 5. Count RSS clicks
$rssClicks = count(array_filter($visits, function ($entry) {
    return $entry['type'] === 'rss_click';
}));

// 6. Identify potential issues
$issues = [];

// Check for duplicate pageviews (same visitor, same page, within 5 seconds).
//
// Pageviews are bucketed by (visitor, page) and each bucket's timestamps are
// sorted, so only timestamps inside the 5-second window are paired up. This
// finds the same number of pairs as comparing every pageview with every other
// one, without the quadratic scan over the whole day.
$duplicateWindow = 5; // seconds
$pageviewGroups = [];
foreach ($pageviews as $visit) {
    if (!isset($visit['visitorId'], $visit['page'], $visit['timestamp'])) {
        continue; // incomplete record: nothing meaningful to compare
    }
    // serialize() keeps values of different types (e.g. 1 and "1") in
    // separate buckets, matching a strict === comparison.
    $groupKey = serialize([$visit['visitorId'], $visit['page']]);
    if (!isset($pageviewGroups[$groupKey])) {
        $pageviewGroups[$groupKey] = [
            'visitor' => $visit['visitorId'],
            'page' => $visit['page'],
            'times' => [],
        ];
    }
    $pageviewGroups[$groupKey]['times'][] = $visit['timestamp'];
}

$duplicates = [];
foreach ($pageviewGroups as $group) {
    $times = $group['times'];
    sort($times);
    $total = count($times);
    $windowStart = 0;
    for ($j = 1; $j < $total; $j++) {
        // Advance the window until its oldest entry is < 5s before $times[$j].
        // The loop always stops at $windowStart === $j (difference 0).
        while ($times[$j] - $times[$windowStart] >= $duplicateWindow) {
            $windowStart++;
        }
        // Everything left in the window forms a duplicate pair with $times[$j];
        // time1 is always the earlier of the two timestamps.
        for ($i = $windowStart; $i < $j; $i++) {
            $duplicates[] = [
                'visitor' => $group['visitor'],
                'page' => $group['page'],
                'time1' => date('H:i:s', $times[$i]),
                'time2' => date('H:i:s', $times[$j]),
            ];
        }
    }
}

if (count($duplicates) > 0) {
    $issues[] = "Found " . count($duplicates) . " potential duplicate pageviews (same visitor, same page, <5s apart)";
}

// Check for suspicious user agents (common bots)
$botPatterns = [
    '/bot/i', '/crawler/i', '/spider/i', '/scraper/i',
    '/google/i', '/bing/i', '/yahoo/i', '/duckduckbot/i',
    '/facebookexternalhit/i', '/twitterbot/i', '/linkedinbot/i'
];

// True as soon as any pattern matches the given user-agent string.
$matchesBotPattern = function ($userAgent) use ($botPatterns) {
    foreach ($botPatterns as $regex) {
        if (preg_match($regex, $userAgent)) {
            return true;
        }
    }
    return false;
};

// $botCount counts matching entries of every type (not just pageviews);
// $botVisitors collects the distinct visitor IDs behind them.
$botCount = 0;
$botVisitors = [];
foreach ($visits as $entry) {
    // Entries without a userAgent are tested as an empty string.
    if (!$matchesBotPattern($entry['userAgent'] ?? '')) {
        continue;
    }
    $botCount++;
    $botVisitors[$entry['visitorId']] = true;
}

if ($botCount > 0) {
    $issues[] = "Found {$botCount} visits from potential bots/crawlers (" . count($botVisitors) . " unique visitors)";
}

// Check for rapid-fire visits (potential scripted access)
// Step 1: collect every pageview timestamp per visitor.
$visitorTimestamps = [];
foreach ($pageviews as $pv) {
    $visitorTimestamps[$pv['visitorId']][] = $pv['timestamp'];
}

// Step 2: for each visitor, count consecutive pageviews (in time order) that
// are less than 2 seconds apart. Visitors with no such gap get no entry.
$rapidVisits = [];
foreach ($visitorTimestamps as $visitorKey => $times) {
    sort($times);
    $lastIndex = count($times) - 1;
    for ($k = 0; $k < $lastIndex; $k++) {
        if ($times[$k + 1] - $times[$k] >= 2) {
            continue;
        }
        $rapidVisits[$visitorKey] = ($rapidVisits[$visitorKey] ?? 0) + 1;
    }
}

if (count($rapidVisits) > 0) {
    $issues[] = "Found " . count($rapidVisits) . " visitors with rapid-fire pageviews (<2s apart)";
}

// Check summary vs raw data discrepancies
$discrepancies = [];

// Returns a "<label> mismatch: ..." line when the two counters differ, or null
// when they agree. The label is passed as a %s argument so it can never be
// misread as part of the format string.
$describeMismatch = function (string $label, $summaryValue, $rawValue) {
    if ($summaryValue == $rawValue) {
        return null;
    }
    return sprintf(
        "%s mismatch: Summary=%d, Raw count=%d (diff: %+d)",
        $label,
        $summaryValue,
        $rawValue,
        $summaryValue - $rawValue
    );
};

// A summary key may be missing (for instance when the summary decoded to an
// empty array); a missing counter is treated as 0, the same default the
// share/RSS comparisons and the display section use.
$counterChecks = [
    ['Total visits', $summary['total'] ?? 0, $pageviewCount],
    ['New visitors', $summary['new'] ?? 0, $newCount],
    ['Returning visitors', $summary['returning'] ?? 0, $returningCount],
];
foreach ($counterChecks as $check) {
    $message = $describeMismatch($check[0], $check[1], $check[2]);
    if ($message !== null) {
        $discrepancies[] = $message;
    }
}

// Compare hourly data; a missing hour in the summary counts as 0.
$summaryByHour = (isset($summary['byHour']) && is_array($summary['byHour'])) ? $summary['byHour'] : [];
$hourlyDiff = false;
for ($h = 0; $h < 24; $h++) {
    if (($summaryByHour[$h] ?? 0) != $byHour[$h]) {
        $hourlyDiff = true;
        break; // one differing hour is enough to report
    }
}
if ($hourlyDiff) {
    $discrepancies[] = "Hourly distribution differs between summary and raw data";
}

// Compare shares
foreach (['mastodon', 'bluesky', 'copy'] as $platform) {
    $message = $describeMismatch(
        "Shares ({$platform})",
        $summary['shares'][$platform] ?? 0,
        $shares[$platform] ?? 0
    );
    if ($message !== null) {
        $discrepancies[] = $message;
    }
}

// Compare RSS
$message = $describeMismatch('RSS clicks', $summary['rss'] ?? 0, $rssClicks);
if ($message !== null) {
    $discrepancies[] = $message;
}

// Display results
// Figures as stored in the summary file; anything missing prints as 0.
echo "SUMMARY DATA:\n";
$summaryRows = [
    'Total visits' => $summary['total'] ?? 0,
    'New visitors' => $summary['new'] ?? 0,
    'Returning visitors' => $summary['returning'] ?? 0,
    'RSS clicks' => $summary['rss'] ?? 0,
];
foreach ($summaryRows as $label => $value) {
    echo " {$label}: {$value}\n";
}
printf(
    " Shares: Mastodon=%s, Bluesky=%s, Copy=%s\n\n",
    $summary['shares']['mastodon'] ?? 0,
    $summary['shares']['bluesky'] ?? 0,
    $summary['shares']['copy'] ?? 0
);

// The same figures recomputed from the raw visit entries.
echo "RAW DATA COUNT:\n",
    " Total pageviews: {$pageviewCount}\n",
    " Unique new visitors: {$newCount}\n",
    " Unique returning visitors: {$returningCount}\n",
    " RSS clicks: {$rssClicks}\n";
printf(
    " Shares: Mastodon=%s, Bluesky=%s, Copy=%s\n",
    $shares['mastodon'],
    $shares['bluesky'],
    $shares['copy']
);
echo " Total visits (all types): " . count($visits) . "\n\n";

// Prints either the heading followed by one " - item" line per finding and a
// blank line, or the all-clear message when there are no findings.
$printFindings = function (array $items, $heading, $allClear) {
    if (count($items) === 0) {
        echo $allClear;
        return;
    }
    echo $heading;
    foreach ($items as $item) {
        echo " - {$item}\n";
    }
    echo "\n";
};

$printFindings($discrepancies, "⚠️ DISCREPANCIES FOUND:\n", "✓ Summary and raw data match!\n\n");
$printFindings($issues, "⚠️ POTENTIAL ISSUES:\n", "✓ No obvious issues detected.\n\n");

// Show top visitors
echo "TOP VISITORS (by pageview count):\n";

// Count pageviews per visitor and remember the user agent of each visitor's
// first pageview in the same pass. Looking the user agent up afterwards by
// comparing IDs with === fails for numeric-string IDs, because PHP turns such
// array keys into integers.
$visitorCounts = [];
$firstUserAgents = [];
foreach ($pageviews as $visit) {
    $vid = $visit['visitorId'];
    $visitorCounts[$vid] = ($visitorCounts[$vid] ?? 0) + 1;
    if (!isset($firstUserAgents[$vid])) {
        $firstUserAgents[$vid] = $visit['userAgent'] ?? 'Unknown';
    }
}

// Highest pageview count first; keep the ten busiest visitors (keys preserved).
arsort($visitorCounts);
$topVisitors = array_slice($visitorCounts, 0, 10, true);
foreach ($topVisitors as $vid => $count) {
    // IDs and user agents are truncated to keep each row on one line.
    $ua = substr((string) ($firstUserAgents[$vid] ?? 'Unknown'), 0, 50);
    echo sprintf(" %s: %d pageviews (UA: %s...)\n", substr((string) $vid, 0, 30), $count, $ua);
}

echo "\n";

// Show hourly breakdown
echo "HOURLY BREAKDOWN (from raw data):\n";

// Bars are scaled so the busiest hour is 50 blocks wide; the floor of 1
// avoids dividing by zero on a day without pageviews.
$peak = max(1, max($byHour));
foreach (range(0, 23) as $h) {
    $count = $byHour[$h];
    $barLength = min(50, (int) ($count / $peak * 50));
    printf(" %02d:00 %5d %s\n", $h, $count, str_repeat('█', $barLength));
}

echo "\n";

// Accuracy notes
// Static caveats printed at the end of every report.
echo "ACCURACY CONSIDERATIONS:\n";
$accuracyNotes = [
    "✓ Data is recalculated from raw timestamps (hourly stats are accurate)",
    "⚠ Bot traffic is NOT filtered (may inflate numbers)",
    "⚠ Ad blockers may prevent tracking (may deflate numbers)",
    "⚠ Self-visits are NOT filtered",
    "⚠ JavaScript-disabled browsers won't be tracked",
    "⚠ Privacy tools may block localStorage (affects visitor ID)",
    "⚠ New/Returning is calculated per-day, not across days",
    "⚠ Multiple tabs/devices = multiple visitors",
];
foreach ($accuracyNotes as $note) {
    echo " {$note}\n";
}
echo "\n";