The Debug API provides specialized endpoints for monitoring memory usage, profiling performance, and diagnosing issues with crawler instances. These endpoints are essential for troubleshooting large-scale crawls and optimizing system resources.

Development Use: These endpoints are primarily intended for development, debugging, and monitoring. They expose internal system state and should be used with caution in production environments.

Endpoints

GET /api/debug/memory

Get memory usage statistics for all active crawler instances across all user sessions. Useful for monitoring system-wide resource consumption and identifying memory-heavy crawls.

Authentication

Requires valid session cookie.

Request Parameters

No request parameters required.

Example Request

curl http://localhost:5000/api/debug/memory \
  -b cookies.txt

Success Response (200 OK)

{
  "total_instances": 3,
  "instances": [
    {
      "session_id": "550e8400-e29b-41d4-a716-446655440000",
      "user_id": 1,
      "created_at": "2025-01-18T14:30:22Z",
      "last_activity": "2025-01-18T14:35:45Z",
      "status": "running",
      "base_url": "https://example.com",
      "memory_usage": {
        "current_mb": 245.8,
        "peak_mb": 312.4,
        "limit_mb": 2048,
        "usage_percent": 12.0
      },
      "stats": {
        "discovered": 1247,
        "crawled": 856,
        "depth": 4
      }
    },
    {
      "session_id": "660e8400-e29b-41d4-a716-446655440001",
      "user_id": 2,
      "created_at": "2025-01-18T14:25:10Z",
      "last_activity": "2025-01-18T14:35:50Z",
      "status": "paused",
      "base_url": "https://anothersite.com",
      "memory_usage": {
        "current_mb": 512.3,
        "peak_mb": 587.9,
        "limit_mb": 2048,
        "usage_percent": 25.0
      },
      "stats": {
        "discovered": 5420,
        "crawled": 3210,
        "depth": 7
      }
    },
    {
      "session_id": "770e8400-e29b-41d4-a716-446655440002",
      "user_id": null,
      "created_at": "2025-01-18T14:33:15Z",
      "last_activity": "2025-01-18T14:35:48Z",
      "status": "completed",
      "base_url": "https://smallsite.com",
      "memory_usage": {
        "current_mb": 78.4,
        "peak_mb": 95.2,
        "limit_mb": 2048,
        "usage_percent": 3.8
      },
      "stats": {
        "discovered": 42,
        "crawled": 42,
        "depth": 2
      }
    }
  ]
}

Response Fields

Field             Type    Description
total_instances   number  Total number of active crawler instances
instances         array   Array of crawler instance objects

Instance Object Fields

Field           Type         Description
session_id      string       Unique session identifier (UUID)
user_id         number|null  User ID (null for guest users)
created_at      string       ISO 8601 timestamp when the instance was created
last_activity   string       ISO 8601 timestamp of last activity
status          string       Current crawl status: "running", "paused", "completed", "idle"
base_url        string       Starting URL of the crawl
memory_usage    object       Memory consumption metrics
stats           object       Basic crawl statistics

Use Cases

  • System Monitoring: Track total memory consumption across all users (see the sketch after this list)
  • Capacity Planning: Identify how many concurrent crawls the system can handle
  • Resource Allocation: Find memory-intensive crawls that may need optimization
  • Session Cleanup: Identify stale sessions that should be cleaned up
  • User Activity: Monitor which users are actively crawling
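
For the monitoring and capacity-planning use cases, the instances array can be aggregated client-side. The following is a minimal sketch using only the fields shown in the response above; it assumes it runs in a logged-in browser session so the session cookie is sent automatically with same-origin requests.

async function summarizeMemory() {
  // Fetch system-wide memory stats for all active crawler instances
  const data = await fetch('/api/debug/memory').then(r => r.json());

  // Sum current memory across all instances
  const totalMb = data.instances.reduce((sum, i) => sum + i.memory_usage.current_mb, 0);
  console.log(`${data.total_instances} instances using ${totalMb.toFixed(1)} MB total`);

  // Identify the most memory-hungry crawl, if any
  const heaviest = [...data.instances]
    .sort((a, b) => b.memory_usage.current_mb - a.memory_usage.current_mb)[0];
  if (heaviest) {
    console.log(`Heaviest crawl: ${heaviest.base_url} (${heaviest.memory_usage.current_mb.toFixed(1)} MB)`);
  }
}

summarizeMemory();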

GET /api/debug/memory/profile

Get detailed memory profiling data for all crawler instances, including breakdowns by component (URL queue, crawled data, link graph, etc.). Essential for identifying memory bottlenecks in large crawls.

Authentication

Requires valid session cookie.

Request Parameters

No request parameters required.

Example Request

curl http://localhost:5000/api/debug/memory/profile \
  -b cookies.txt

Success Response (200 OK)

{
  "total_instances": 2,
  "profiles": [
    {
      "session_id": "550e8400-e29b-41d4-a716-446655440000",
      "user_id": 1,
      "status": "running",
      "base_url": "https://example.com",
      "total_memory_mb": 245.8,
      "breakdown": {
        "url_queue": {
          "size_mb": 12.5,
          "percent": 5.1,
          "item_count": 391
        },
        "crawled_urls": {
          "size_mb": 168.4,
          "percent": 68.5,
          "item_count": 856
        },
        "link_graph": {
          "size_mb": 42.3,
          "percent": 17.2,
          "item_count": 3421
        },
        "issues": {
          "size_mb": 8.7,
          "percent": 3.5,
          "item_count": 124
        },
        "visited_set": {
          "size_mb": 9.8,
          "percent": 4.0,
          "item_count": 1247
        },
        "other": {
          "size_mb": 4.1,
          "percent": 1.7
        }
      },
      "optimization_suggestions": [
        "Consider reducing exportFields to decrease crawled_urls memory",
        "URL queue is healthy (<10%)"
      ]
    },
    {
      "session_id": "660e8400-e29b-41d4-a716-446655440001",
      "user_id": 2,
      "status": "paused",
      "base_url": "https://largesite.com",
      "total_memory_mb": 1024.5,
      "breakdown": {
        "url_queue": {
          "size_mb": 156.8,
          "percent": 15.3,
          "item_count": 12450
        },
        "crawled_urls": {
          "size_mb": 687.2,
          "percent": 67.1,
          "item_count": 45230
        },
        "link_graph": {
          "size_mb": 142.1,
          "percent": 13.9,
          "item_count": 156780
        },
        "issues": {
          "size_mb": 18.5,
          "percent": 1.8,
          "item_count": 892
        },
        "visited_set": {
          "size_mb": 12.4,
          "percent": 1.2,
          "item_count": 57680
        },
        "other": {
          "size_mb": 7.5,
          "percent": 0.7
        }
      },
      "optimization_suggestions": [
        "URL queue is large (>10%), consider reducing maxUrls or maxDepth",
        "High memory usage (50%+), monitor for potential issues",
        "Consider exporting data and restarting crawl to free memory"
      ]
    }
  ]
}

Memory Breakdown Components

  • url_queue: Pending URLs waiting to be crawled. Should be <10% of total. Reduce maxDepth/maxUrls if high.
  • crawled_urls: Data for all crawled pages (titles, meta, content, etc.). Largest component (60-70%). Reduce exportFields to save memory.
  • link_graph: All discovered link relationships. Grows with internal linking density. Disable link tracking if not needed.
  • issues: Detected SEO and technical issues. Usually small. Use exclusion patterns to reduce false positives.
  • visited_set: Set of visited URLs (for deduplication). Minimal overhead. Uses efficient hash sets.
  • other: Session state, settings, and overhead. Should be minimal (<5%).
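
To see where memory goes for a single crawl, the breakdown object can be sorted by size. A minimal sketch, assuming profile is one entry from the profiles array returned by /api/debug/memory/profile:

function printBreakdown(profile) {
  // List components from largest to smallest memory footprint
  Object.entries(profile.breakdown)
    .sort(([, a], [, b]) => b.size_mb - a.size_mb)
    .forEach(([component, data]) => {
      console.log(`${component}: ${data.size_mb.toFixed(1)} MB (${data.percent.toFixed(1)}%)`);
    });
}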

Use Cases

  • Memory Optimization: Identify which components consume most memory
  • Troubleshooting: Diagnose memory leaks or unexpected growth
  • Capacity Planning: Estimate memory needs for large crawls
  • Configuration Tuning: Adjust settings based on memory profiles
  • Performance Analysis: Find bottlenecks in data structures

Pro Tip: Run this endpoint periodically during large crawls to monitor memory growth trends. If crawled_urls exceeds 80% of total memory and continues growing, consider exporting intermediate results.
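
One way to automate that check, as a minimal sketch built on the fields documented above (mySessionId is assumed to hold the session ID of the crawl being watched):

async function checkCrawledUrlsShare(sessionId) {
  const data = await fetch('/api/debug/memory/profile').then(r => r.json());
  const profile = data.profiles.find(p => p.session_id === sessionId);
  if (!profile) return;

  // Warn when crawled page data dominates total memory
  const share = profile.breakdown.crawled_urls.percent;
  if (share > 80) {
    console.warn(`crawled_urls is ${share.toFixed(1)}% of memory, consider exporting intermediate results.`);
  }
}

// Check once a minute during a large crawl
setInterval(() => checkCrawledUrlsShare(mySessionId), 60000);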

Memory Monitoring Best Practices

1. Set Appropriate Memory Limits

Configure memoryLimit in settings to prevent out-of-memory crashes:

// For 8GB system: Set limit to 2048MB (25% of RAM)
// For 16GB system: Set limit to 4096MB (25% of RAM)
// For 32GB system: Set limit to 8192MB (25% of RAM)

await fetch('/api/save_settings', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ memoryLimit: 2048 })
});

2. Monitor During Large Crawls

Poll the debug endpoints every 10-30 seconds during active crawls:

async function monitorMemory() {
  const response = await fetch('/api/debug/memory');
  const data = await response.json();
  
  const myInstance = data.instances.find(i => i.session_id === mySessionId);

  // Guard against the session not appearing in the response
  if (!myInstance) return;

  if (myInstance.memory_usage.usage_percent > 80) {
    console.warn('Memory usage high! Consider pausing crawl.');
  }
}

// Monitor every 30 seconds
setInterval(monitorMemory, 30000);

3. Identify Memory Leaks

Compare memory profiles over time to detect leaks:

// Fetch this session's profile from the profiles array
const getProfile = async () =>
  (await fetch('/api/debug/memory/profile').then(r => r.json()))
    .profiles.find(p => p.session_id === mySessionId);

const profile1 = await getProfile();

// Wait 5 minutes while crawling...
await new Promise(r => setTimeout(r, 300000));

const profile2 = await getProfile();

// Compare growth rates
const growth = profile2.total_memory_mb - profile1.total_memory_mb;
const urlsAdded = profile2.breakdown.crawled_urls.item_count - profile1.breakdown.crawled_urls.item_count;
const memoryPerUrl = urlsAdded > 0 ? growth / urlsAdded : 0;

console.log(`Memory per URL: ${memoryPerUrl.toFixed(2)} MB`);
// Expected: 0.1-0.5 MB per URL depending on page complexity

4. Optimize Based on Profiles

Use profiling data to adjust settings (a sketch applying these thresholds follows the table):

  • url_queue > 15%: Reduce maxDepth or maxUrls
  • crawled_urls > 75%: Export fewer fields, disable images/analytics collection
  • link_graph > 20%: Site has high internal linking. Consider disabling detailed link tracking.
  • issues > 10%: Many issues detected. Use exclusion patterns to filter false positives.
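
These thresholds can also be checked programmatically. A minimal sketch, where the threshold values mirror the table above (they are not returned by the API) and profile is one entry from the profiles array:

// Thresholds and advice mirror the table above; they are not part of the API response
const THRESHOLDS = [
  { component: 'url_queue', percent: 15, advice: 'Reduce maxDepth or maxUrls' },
  { component: 'crawled_urls', percent: 75, advice: 'Export fewer fields; disable images/analytics collection' },
  { component: 'link_graph', percent: 20, advice: 'Consider disabling detailed link tracking' },
  { component: 'issues', percent: 10, advice: 'Use exclusion patterns to filter false positives' }
];

function suggestOptimizations(profile) {
  return THRESHOLDS
    .filter(({ component, percent }) => (profile.breakdown[component]?.percent ?? 0) > percent)
    .map(({ component, percent, advice }) => `${component} > ${percent}%: ${advice}`);
}

// Example: suggestOptimizations(profile).forEach(s => console.log(s));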

5. Session Cleanup

LibreCrawl automatically removes inactive instances after 1 hour, but you can manually identify stale sessions:

const response = await fetch('/api/debug/memory');
const data = await response.json();

const now = new Date();
const stale = data.instances.filter(instance => {
  const lastActivity = new Date(instance.last_activity);
  const minutesInactive = (now - lastActivity) / 60000;
  return minutesInactive > 30 && instance.status === 'completed';
});

console.log(`Found ${stale.length} stale sessions ready for cleanup`);

Debug Dashboard

LibreCrawl includes a built-in debug dashboard accessible via web browser:

# Web UI (requires login)
http://localhost:5000/debug/memory

The dashboard provides:

  • Real-time memory graphs for all instances
  • Visual breakdown charts (pie/bar)
  • Auto-refresh every 5 seconds
  • Session details and statistics
  • Color-coded warnings for high memory usage

Access: The debug dashboard requires authentication. Navigate to /debug/memory after logging into LibreCrawl.

Example: Complete Monitoring Script

// Complete monitoring script for large crawls
class CrawlMonitor {
  constructor(baseUrl = 'http://localhost:5000') {
    this.baseUrl = baseUrl;
    this.sessionId = null;
    this.memoryHistory = [];
  }

  async getMemoryProfile() {
    const response = await fetch(`${this.baseUrl}/api/debug/memory/profile`);
    return response.json();
  }

  async monitorInstance(sessionId) {
    this.sessionId = sessionId;
    const profile = await this.getMemoryProfile();
    const instance = profile.profiles.find(p => p.session_id === sessionId);
    
    if (!instance) {
      console.error('Instance not found');
      return;
    }

    // Track memory history
    this.memoryHistory.push({
      timestamp: new Date(),
      memory_mb: instance.total_memory_mb,
      crawled: instance.breakdown.crawled_urls.item_count
    });

    // Analyze trends
    if (this.memoryHistory.length >= 2) {
      const prev = this.memoryHistory[this.memoryHistory.length - 2];
      const curr = this.memoryHistory[this.memoryHistory.length - 1];
      
      const memGrowth = curr.memory_mb - prev.memory_mb;
      const urlsAdded = curr.crawled - prev.crawled;
      const memPerUrl = urlsAdded > 0 ? memGrowth / urlsAdded : 0;
      
      console.log(`Memory/URL: ${memPerUrl.toFixed(2)} MB`);
      
      if (memPerUrl > 1.0) {
        console.warn('⚠️ High memory per URL! Check page complexity.');
      }
    }

    // Display current state
    console.log(`Total Memory: ${instance.total_memory_mb.toFixed(1)} MB`);
    console.log('Breakdown:');
    Object.entries(instance.breakdown).forEach(([component, data]) => {
      console.log(` ${component}: ${data.size_mb.toFixed(1)} MB (${data.percent.toFixed(1)}%)`);
    });

    // Display suggestions
    if (instance.optimization_suggestions.length > 0) {
      console.log('\nSuggestions:');
      instance.optimization_suggestions.forEach(s => console.log(` 💡 ${s}`));
    }
  }
}

// Usage
const monitor = new CrawlMonitor();
setInterval(() => monitor.monitorInstance(mySessionId), 30000);

Next Steps