The Debug API provides specialized endpoints for monitoring memory usage, profiling performance, and diagnosing issues with crawler instances. These endpoints are essential for troubleshooting large-scale crawls and optimizing system resources.

Development Use: These endpoints are primarily intended for development, debugging, and monitoring. They expose internal system state and should be used with caution in production environments.

Endpoints

GET /api/debug/memory

Get memory usage statistics for all active crawler instances across all user sessions. Useful for monitoring system-wide resource consumption and identifying memory-heavy crawls.

Authentication

Requires valid session cookie.

Request Parameters

No request parameters required.

Example Request

curl http://localhost:5000/api/debug/memory \
  -b cookies.txt

Success Response (200 OK)

{
  "total_instances": 3,
  "instances": [
    {
      "session_id": "550e8400-e29b-41d4-a716-446655440000",
      "user_id": 1,
      "created_at": "2025-01-18T14:30:22Z",
      "last_activity": "2025-01-18T14:35:45Z",
      "status": "running",
      "base_url": "https://example.com",
      "memory_usage": {
        "current_mb": 245.8,
        "peak_mb": 312.4,
        "limit_mb": 2048,
        "usage_percent": 12.0
      },
      "stats": {
        "discovered": 1247,
        "crawled": 856,
        "depth": 4
      }
    },
    {
      "session_id": "660e8400-e29b-41d4-a716-446655440001",
      "user_id": 2,
      "created_at": "2025-01-18T14:25:10Z",
      "last_activity": "2025-01-18T14:35:50Z",
      "status": "paused",
      "base_url": "https://anothersite.com",
      "memory_usage": {
        "current_mb": 512.3,
        "peak_mb": 587.9,
        "limit_mb": 2048,
        "usage_percent": 25.0
      },
      "stats": {
        "discovered": 5420,
        "crawled": 3210,
        "depth": 7
      }
    },
    {
      "session_id": "770e8400-e29b-41d4-a716-446655440002",
      "user_id": null,
      "created_at": "2025-01-18T14:33:15Z",
      "last_activity": "2025-01-18T14:35:48Z",
      "status": "completed",
      "base_url": "https://smallsite.com",
      "memory_usage": {
        "current_mb": 78.4,
        "peak_mb": 95.2,
        "limit_mb": 2048,
        "usage_percent": 3.8
      },
      "stats": {
        "discovered": 42,
        "crawled": 42,
        "depth": 2
      }
    }
  ]
}

Response Fields

Field             Type    Description
total_instances   number  Total number of active crawler instances
instances         array   Array of crawler instance objects

Instance Object Fields

Field           Type         Description
session_id      string       Unique session identifier (UUID)
user_id         number|null  User ID (null for guest users)
created_at      string       ISO 8601 timestamp when the instance was created
last_activity   string       ISO 8601 timestamp of last activity
status          string       Current crawl status: "running", "paused", "completed", "idle"
base_url        string       Starting URL of the crawl
memory_usage    object       Memory consumption metrics
stats           object       Basic crawl statistics

Use Cases

  • System Monitoring: Track total memory consumption across all users (see the sketch after this list)
  • Capacity Planning: Identify how many concurrent crawls the system can handle
  • Resource Allocation: Find memory-intensive crawls that may need optimization
  • Session Cleanup: Identify stale sessions that should be cleaned up
  • User Activity: Monitor which users are actively crawling
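
For the monitoring and capacity-planning use cases, the instances array can be aggregated client-side. The following is a minimal sketch using only the fields shown in the response above; it assumes it runs in a logged-in browser session so the session cookie is sent automatically with same-origin requests.

async function summarizeMemory() {
  // Fetch system-wide memory stats for all active crawler instances
  const data = await fetch('/api/debug/memory').then(r => r.json());

  // Sum current memory across all instances
  const totalMb = data.instances.reduce((sum, i) => sum + i.memory_usage.current_mb, 0);
  console.log(`${data.total_instances} instances using ${totalMb.toFixed(1)} MB total`);

  // Identify the most memory-hungry crawl, if any
  const heaviest = [...data.instances]
    .sort((a, b) => b.memory_usage.current_mb - a.memory_usage.current_mb)[0];
  if (heaviest) {
    console.log(`Heaviest crawl: ${heaviest.base_url} (${heaviest.memory_usage.current_mb.toFixed(1)} MB)`);
  }
}

summarizeMemory();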

GET /api/debug/memory/profile

Get detailed memory profiling data for all crawler instances, including breakdowns by component (URL queue, crawled data, link graph, etc.). Essential for identifying memory bottlenecks in large crawls.

Authentication

Requires valid session cookie.

Request Parameters

No request parameters required.

Example Request

curl http://localhost:5000/api/debug/memory/profile \
  -b cookies.txt

Success Response (200 OK)

{
  "total_instances": 2,
  "profiles": [
    {
      "session_id": "550e8400-e29b-41d4-a716-446655440000",
      "user_id": 1,
      "status": "running",
      "base_url": "https://example.com",
      "total_memory_mb": 245.8,
      "breakdown": {
        "url_queue": {
          "size_mb": 12.5,
          "percent": 5.1,
          "item_count": 391
        },
        "crawled_urls": {
          "size_mb": 168.4,
          "percent": 68.5,
          "item_count": 856
        },
        "link_graph": {
          "size_mb": 42.3,
          "percent": 17.2,
          "item_count": 3421
        },
        "issues": {
          "size_mb": 8.7,
          "percent": 3.5,
          "item_count": 124
        },
        "visited_set": {
          "size_mb": 9.8,
          "percent": 4.0,
          "item_count": 1247
        },
        "other": {
          "size_mb": 4.1,
          "percent": 1.7
        }
      },
      "optimization_suggestions": [
        "Consider reducing exportFields to decrease crawled_urls memory",
        "URL queue is healthy (<10%)"
      ]
    },
    {
      "session_id": "660e8400-e29b-41d4-a716-446655440001",
      "user_id": 2,
      "status": "paused",
      "base_url": "https://largesite.com",
      "total_memory_mb": 1024.5,
      "breakdown": {
        "url_queue": {
          "size_mb": 156.8,
          "percent": 15.3,
          "item_count": 12450
        },
        "crawled_urls": {
          "size_mb": 687.2,
          "percent": 67.1,
          "item_count": 45230
        },
        "link_graph": {
          "size_mb": 142.1,
          "percent": 13.9,
          "item_count": 156780
        },
        "issues": {
          "size_mb": 18.5,
          "percent": 1.8,
          "item_count": 892
        },
        "visited_set": {
          "size_mb": 12.4,
          "percent": 1.2,
          "item_count": 57680
        },
        "other": {
          "size_mb": 7.5,
          "percent": 0.7
        }
      },
      "optimization_suggestions": [
        "URL queue is large (>10%), consider reducing maxUrls or maxDepth",
        "High memory usage (50%+), monitor for potential issues",
        "Consider exporting data and restarting crawl to free memory"
      ]
    }
  ]
}

Memory Breakdown Components

  • url_queue: Pending URLs waiting to be crawled. Should be <10% of total. Reduce maxDepth/maxUrls if high.
  • crawled_urls: Data for all crawled pages (titles, meta, content, etc.). Largest component (60-70%). Reduce exportFields to save memory.
  • link_graph: All discovered link relationships. Grows with internal linking density. Disable link tracking if not needed.
  • issues: Detected SEO and technical issues. Usually small. Use exclusion patterns to reduce false positives.
  • visited_set: Set of visited URLs (for deduplication). Minimal overhead. Uses efficient hash sets.
  • other: Session state, settings, and overhead. Should be minimal (<5%).
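
To see where memory goes for a single crawl, the breakdown object can be sorted by size. A minimal sketch, assuming profile is one entry from the profiles array returned by /api/debug/memory/profile:

function printBreakdown(profile) {
  // List components from largest to smallest memory footprint
  Object.entries(profile.breakdown)
    .sort(([, a], [, b]) => b.size_mb - a.size_mb)
    .forEach(([component, data]) => {
      console.log(`${component}: ${data.size_mb.toFixed(1)} MB (${data.percent.toFixed(1)}%)`);
    });
}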

Use Cases

  • Memory Optimization: Identify which components consume most memory
  • Troubleshooting: Diagnose memory leaks or unexpected growth
  • Capacity Planning: Estimate memory needs for large crawls
  • Configuration Tuning: Adjust settings based on memory profiles
  • Performance Analysis: Find bottlenecks in data structures

Pro Tip: Run this endpoint periodically during large crawls to monitor memory growth trends. If crawled_urls exceeds 80% of total memory and continues growing, consider exporting intermediate results.
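
One way to automate that check, as a minimal sketch built on the fields documented above (mySessionId is assumed to hold the session ID of the crawl being watched):

async function checkCrawledUrlsShare(sessionId) {
  const data = await fetch('/api/debug/memory/profile').then(r => r.json());
  const profile = data.profiles.find(p => p.session_id === sessionId);
  if (!profile) return;

  // Warn when crawled page data dominates total memory
  const share = profile.breakdown.crawled_urls.percent;
  if (share > 80) {
    console.warn(`crawled_urls is ${share.toFixed(1)}% of memory, consider exporting intermediate results.`);
  }
}

// Check once a minute during a large crawl
setInterval(() => checkCrawledUrlsShare(mySessionId), 60000);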

Memory Monitoring Best Practices

1. Set Appropriate Memory Limits

Configure memoryLimit in settings to prevent out-of-memory crashes:

// For 8GB system: Set limit to 2048MB (25% of RAM)
// For 16GB system: Set limit to 4096MB (25% of RAM)
// For 32GB system: Set limit to 8192MB (25% of RAM)

await fetch('/api/save_settings', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({ memoryLimit: 2048 })
});

2. Monitor During Large Crawls

Poll the debug endpoints every 10-30 seconds during active crawls:

async function monitorMemory() {
  const response = await fetch('/api/debug/memory');
  const data = await response.json();
  
  const myInstance = data.instances.find(i => i.session_id === mySessionId);

  // Guard against the session not appearing in the response
  if (!myInstance) return;

  if (myInstance.memory_usage.usage_percent > 80) {
    console.warn('Memory usage high! Consider pausing crawl.');
  }
}

// Monitor every 30 seconds
setInterval(monitorMemory, 30000);

3. Identify Memory Leaks

Compare memory profiles over time to detect leaks:

// Fetch this session's profile from the profiles array
const getProfile = async () =>
  (await fetch('/api/debug/memory/profile').then(r => r.json()))
    .profiles.find(p => p.session_id === mySessionId);

const profile1 = await getProfile();

// Wait 5 minutes while crawling...
await new Promise(r => setTimeout(r, 300000));

const profile2 = await getProfile();

// Compare growth rates
const growth = profile2.total_memory_mb - profile1.total_memory_mb;
const urlsAdded = profile2.breakdown.crawled_urls.item_count - profile1.breakdown.crawled_urls.item_count;
const memoryPerUrl = urlsAdded > 0 ? growth / urlsAdded : 0;

console.log(`Memory per URL: ${memoryPerUrl.toFixed(2)} MB`);
// Expected: 0.1-0.5 MB per URL depending on page complexity

4. Optimize Based on Profiles

Use profiling data to adjust settings (a sketch applying these thresholds follows the table):

  • url_queue > 15%: Reduce maxDepth or maxUrls
  • crawled_urls > 75%: Export fewer fields, disable images/analytics collection
  • link_graph > 20%: Site has high internal linking. Consider disabling detailed link tracking.
  • issues > 10%: Many issues detected. Use exclusion patterns to filter false positives.
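
These thresholds can also be checked programmatically. A minimal sketch, where the threshold values mirror the table above (they are not returned by the API) and profile is one entry from the profiles array:

// Thresholds and advice mirror the table above; they are not part of the API response
const THRESHOLDS = [
  { component: 'url_queue', percent: 15, advice: 'Reduce maxDepth or maxUrls' },
  { component: 'crawled_urls', percent: 75, advice: 'Export fewer fields; disable images/analytics collection' },
  { component: 'link_graph', percent: 20, advice: 'Consider disabling detailed link tracking' },
  { component: 'issues', percent: 10, advice: 'Use exclusion patterns to filter false positives' }
];

function suggestOptimizations(profile) {
  return THRESHOLDS
    .filter(({ component, percent }) => (profile.breakdown[component]?.percent ?? 0) > percent)
    .map(({ component, percent, advice }) => `${component} > ${percent}%: ${advice}`);
}

// Example: suggestOptimizations(profile).forEach(s => console.log(s));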

5. Session Cleanup

LibreCrawl automatically removes inactive instances after 1 hour, but you can manually identify stale sessions:

const response = await fetch('/api/debug/memory');
const data = await response.json();

const now = new Date();
const stale = data.instances.filter(instance => {
  const lastActivity = new Date(instance.last_activity);
  const minutesInactive = (now - lastActivity) / 60000;
  return minutesInactive > 30 && instance.status === 'completed';
});

console.log(`Found ${stale.length} stale sessions ready for cleanup`);

Debug Dashboard

LibreCrawl includes a built-in debug dashboard accessible via web browser:

# Web UI (requires login)
http://localhost:5000/debug/memory

The dashboard provides:

  • Real-time memory graphs for all instances
  • Visual breakdown charts (pie/bar)
  • Auto-refresh every 5 seconds
  • Session details and statistics
  • Color-coded warnings for high memory usage

Access: The debug dashboard requires authentication. Navigate to /debug/memory after logging into LibreCrawl.

Example: Complete Monitoring Script

// Complete monitoring script for large crawls
class CrawlMonitor {
  constructor(baseUrl = 'http://localhost:5000') {
    this.baseUrl = baseUrl;
    this.sessionId = null;
    this.memoryHistory = [];
  }

  async getMemoryProfile() {
    const response = await fetch(`${this.baseUrl}/api/debug/memory/profile`);
    return response.json();
  }

  async monitorInstance(sessionId) {
    this.sessionId = sessionId;
    const profile = await this.getMemoryProfile();
    const instance = profile.profiles.find(p => p.session_id === sessionId);
    
    if (!instance) {
      console.error('Instance not found');
      return;
    }

    // Track memory history
    this.memoryHistory.push({
      timestamp: new Date(),
      memory_mb: instance.total_memory_mb,
      crawled: instance.breakdown.crawled_urls.item_count
    });

    // Analyze trends
    if (this.memoryHistory.length >= 2) {
      const prev = this.memoryHistory[this.memoryHistory.length - 2];
      const curr = this.memoryHistory[this.memoryHistory.length - 1];
      
      const memGrowth = curr.memory_mb - prev.memory_mb;
      const urlsAdded = curr.crawled - prev.crawled;
      const memPerUrl = urlsAdded > 0 ? memGrowth / urlsAdded : 0;
      
      console.log(`Memory/URL: ${memPerUrl.toFixed(2)} MB`);
      
      if (memPerUrl > 1.0) {
        console.warn('⚠️ High memory per URL! Check page complexity.');
      }
    }

    // Display current state
    console.log(`Total Memory: ${instance.total_memory_mb.toFixed(1)} MB`);
    console.log('Breakdown:');
    Object.entries(instance.breakdown).forEach(([component, data]) => {
      console.log(` ${component}: ${data.size_mb.toFixed(1)} MB (${data.percent.toFixed(1)}%)`);
    });

    // Display suggestions
    if (instance.optimization_suggestions.length > 0) {
      console.log('\nSuggestions:');
      instance.optimization_suggestions.forEach(s => console.log(` 💡 ${s}`));
    }
  }
}

// Usage
const monitor = new CrawlMonitor();
setInterval(() => monitor.monitorInstance(mySessionId), 30000);

Next Steps