Document Loaders¶
Document Loaders are the foundation of ExtractThinker's document processing pipeline. They handle the initial loading and preprocessing of documents, converting them into a standardized format that can be used by other components.

Basic Concept¶
A Document Loader can return content in two formats:

- A simple string containing the extracted text
- A structured object containing pages and their content, whose exact shape depends on the loader
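A hedged sketch of consuming both shapes, given some loader instance (the page dictionary keys shown here match the vision-mode example later on this page; the exact structure varies by loader):

content = loader.load("document.pdf")

if isinstance(content, str):
    # Plain-text loaders return the extracted text directly
    print(content)
else:
    # Page-based loaders return per-page entries
    for page in content:
        print(page["content"])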
Base Document Loader
The base DocumentLoader class defines the interface and common functionality that all loaders must implement:
- load_content_from_file: Process files from disk
- load_content_from_stream: Process BytesIO streams
- can_handle: Validate file types
- convert_to_images: Convert documents to images
from abc import ABC, abstractmethod
import io
from io import BytesIO
from PIL import Image
import pypdfium2 as pdfium
from typing import Any, Dict, Union, List
from cachetools import TTLCache
import os
import magic
from extract_thinker.utils import get_file_extension, check_mime_type
from playwright.sync_api import sync_playwright
from urllib.parse import urlparse
import base64
import math


class DocumentLoader(ABC):
    # SUPPORTED_FORMATS = [
    #     "pdf", "jpg", "jpeg", "png", "tiff", "bmp"
    # ]

    def __init__(self, content: Any = None, cache_ttl: int = 300, screenshot_timeout: int = 1000):
        """Initialize loader.

        Args:
            content: Initial content
            cache_ttl: Cache time-to-live in seconds
            screenshot_timeout: Timeout in milliseconds to wait for page content load when capturing a screenshot.
        """
        self.content = content
        self.file_path = None
        self.cache = TTLCache(maxsize=100, ttl=cache_ttl)
        self.vision_mode = False
        self.max_image_size = None  # No maximum size unless explicitly set
        self.is_url = False  # Indicates if the source is a URL
        self.screenshot_timeout = screenshot_timeout

    def set_max_image_size(self, size: int) -> None:
        """Set the maximum image size."""
        self.max_image_size = size

    def set_vision_mode(self, enabled: bool = True) -> None:
        """Enable or disable vision mode processing."""
        self.vision_mode = enabled

    def set_screenshot_timeout(self, timeout: int) -> None:
        """Set the screenshot timeout in milliseconds for capturing a screenshot from a URL."""
        self.screenshot_timeout = timeout

    def can_handle(self, source: Union[str, BytesIO]) -> bool:
        """
        Checks if the loader can handle the given source.

        Args:
            source: Either a file path (str) or a BytesIO stream

        Returns:
            bool: True if the loader can handle the source, False otherwise
        """
        try:
            if isinstance(source, str):
                return self._can_handle_file_path(source)
            elif isinstance(source, BytesIO):
                return self._can_handle_stream(source)
            return False
        except Exception:
            return False

    def _can_handle_file_path(self, file_path: str) -> bool:
        """Checks if the loader can handle the given file path."""
        if not os.path.isfile(file_path):
            return False
        file_type = get_file_extension(file_path)
        return file_type.lower() in [fmt.lower() for fmt in self.SUPPORTED_FORMATS]

    def _can_handle_stream(self, stream: BytesIO) -> bool:
        """Checks if the loader can handle the given BytesIO stream."""
        try:
            mime = magic.from_buffer(stream.getvalue(), mime=True)
            stream.seek(0)  # Reset stream position
            return check_mime_type(mime, self.SUPPORTED_FORMATS)
        except Exception:
            return False

    @abstractmethod
    def load(self, source: Union[str, BytesIO]) -> Any:
        """Enhanced load method that handles vision mode."""
        pass

    def getContent(self) -> Any:
        return self.content

    def convert_to_images(self, file: Union[str, io.BytesIO, io.BufferedReader], scale: float = 300 / 72) -> Dict[int, bytes]:
        # Determine if the input is a file path or a stream
        if isinstance(file, str):
            return self._convert_file_to_images(file, scale)
        elif isinstance(file, (io.BytesIO, io.BufferedReader)):  # Accept both BytesIO and BufferedReader
            return self._convert_stream_to_images(file, scale)
        else:
            raise TypeError("file must be a file path (str) or a file-like stream")

    def _convert_file_to_images(self, file_path: str, scale: float) -> Dict[int, bytes]:
        """Convert file to images, handling both URLs and local files."""
        # Check if it's a URL
        if self._is_url(file_path):
            self.is_url = True  # Set the instance variable if the source is a URL
            try:
                screenshot = self._capture_screenshot_from_url(file_path)
                # Convert screenshot to PIL Image for potential resizing
                img = Image.open(BytesIO(screenshot))
                img = self._resize_if_needed(img)
                # Split into vertical chunks
                chunks = self._split_image_vertically(img)
                # Return dictionary with chunks as list
                return {0: chunks}  # All chunks from URL are considered "page 0"
            except Exception as e:
                raise ValueError(f"Failed to capture screenshot from URL: {str(e)}")

        # Existing code for local files...
        try:
            Image.open(file_path)
            is_image = True
        except IOError:
            is_image = False

        if is_image:
            with open(file_path, "rb") as f:
                return {0: f.read()}

        return self._convert_pdf_to_images(pdfium.PdfDocument(file_path), scale)

    def _convert_stream_to_images(self, file_stream: io.BytesIO, scale: float) -> Dict[int, bytes]:
        # Check if the stream is already an image
        try:
            Image.open(file_stream)
            is_image = True
        except IOError:
            is_image = False

        # Reset stream position
        file_stream.seek(0)

        if is_image:
            # If it is, return it as is
            return {0: file_stream.read()}

        # If it's not an image, proceed with the conversion
        return self._convert_pdf_to_images(pdfium.PdfDocument(file_stream), scale)

    def _resize_if_needed(self, image: Image.Image) -> Image.Image:
        """Resize image if it exceeds maximum dimensions while maintaining aspect ratio.

        Args:
            image: PIL Image object

        Returns:
            PIL Image object (resized if necessary)
        """
        if self.max_image_size is None:  # Skip resizing if max_image_size not set
            return image

        width, height = image.size
        if width > self.max_image_size or height > self.max_image_size:
            # Calculate scaling factor to fit within max dimensions
            scale = self.max_image_size / max(width, height)
            new_width = int(width * scale)
            new_height = int(height * scale)
            return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
        return image

    def _convert_pdf_to_images(self, pdf_file, scale: float) -> Dict[int, bytes]:
        # Get all pages at once
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=list(range(len(pdf_file))),
            scale=scale,
        )

        # Convert all images to bytes and store in dictionary
        final_images = {}
        for page_index, image in enumerate(renderer):
            # Resize image if needed
            image = self._resize_if_needed(image)
            image_byte_array = BytesIO()
            image.save(image_byte_array, format="jpeg", optimize=True)
            final_images[page_index] = image_byte_array.getvalue()

        return final_images

    def can_handle_vision(self, source: Union[str, BytesIO]) -> bool:
        """
        Checks if the loader can handle the source in vision mode.

        Args:
            source: Either a file path (str), URL, or a BytesIO stream

        Returns:
            bool: True if the loader can handle the source in vision mode
        """
        try:
            if isinstance(source, str):
                if self._is_url(source):
                    return True  # URLs are always supported in vision mode
                ext = get_file_extension(source).lower()
                return ext in ['pdf', 'jpg', 'jpeg', 'png', 'tiff', 'bmp']
            elif isinstance(source, BytesIO):
                try:
                    Image.open(source)
                    return True
                except:
                    # Try to load as PDF
                    try:
                        pdfium.PdfDocument(source)
                        return True
                    except:
                        return False
            return False
        except Exception:
            return False

    def can_handle_paginate(self, source: Union[str, BytesIO]) -> bool:
        """
        Checks if the source supports pagination (e.g., PDF, PPT).

        Args:
            source: Either a file path (str) or a BytesIO stream

        Returns:
            bool: True if the source supports pagination
        """
        try:
            if isinstance(source, str):
                # For file paths, check the extension
                ext = get_file_extension(source).lower()
            else:
                # For BytesIO streams, use magic to detect mime type
                mime = magic.from_buffer(source.getvalue(), mime=True)
                source.seek(0)  # Reset stream position
                return mime == 'application/pdf'

            # List of extensions that support pagination
            return ext in ['pdf']
        except Exception:
            return False

    @staticmethod
    def _check_playwright_dependencies():
        """
        Check if the playwright dependency is installed.

        Raises:
            ImportError: If playwright is not installed.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            raise ImportError(
                "You are using vision with url. You need to install playwright."
                "`pip install playwright` and run `playwright install`."
            )

    def _capture_screenshot_from_url(self, url: str) -> bytes:
        """
        Captures a full-page screenshot of a URL using Playwright.

        Args:
            url: The URL to capture

        Returns:
            bytes: The screenshot image data
        """
        # Optional: Check if playwright is installed before attempting to use it.
        self._check_playwright_dependencies()
        from playwright.sync_api import sync_playwright  # Import after the dependency check

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            try:
                # Navigate to URL
                page.goto(url, wait_until='networkidle')

                # Optional: Handle cookie consent popups (customize selectors as needed)
                try:
                    page.click('button:has-text("Accept")', timeout=10000)
                except Exception:
                    pass  # Ignore if no cookie banner is found

                # Wait for content to load with the configurable timeout
                page.wait_for_timeout(self.screenshot_timeout)

                # Capture full page screenshot
                screenshot = page.screenshot(full_page=True)
                return screenshot
            finally:
                browser.close()

    def _split_image_vertically(self, img: Image.Image, chunk_height: int = 1000) -> List[bytes]:
        """
        Splits a tall PIL Image into vertical chunks of `chunk_height`.
        Returns a list of bytes in PNG format, in top-to-bottom order.

        Args:
            img: PIL Image to split
            chunk_height: Height of each chunk in pixels

        Returns:
            List of PNG-encoded bytes for each chunk
        """
        width, height = img.size
        num_chunks = math.ceil(height / chunk_height)

        chunks_bytes = []
        for i in range(num_chunks):
            top = i * chunk_height
            bottom = min((i + 1) * chunk_height, height)
            crop_box = (0, top, width, bottom)

            # Crop the chunk
            chunk_img = img.crop(crop_box)

            # Convert chunk to bytes
            chunk_bytes = io.BytesIO()
            chunk_img.save(chunk_bytes, format="PNG", optimize=True)
            chunk_bytes.seek(0)
            chunks_bytes.append(chunk_bytes.read())

        return chunks_bytes

    def _is_url(self, source: str) -> bool:
        """Check if the source string is a URL."""
        try:
            result = urlparse(source)
            return bool(result.scheme and result.netloc)
        except:
            return False
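Subclasses fill in the pieces the base class leaves open. As a minimal sketch, a hypothetical plain-text loader (not part of the library) only needs to declare its SUPPORTED_FORMATS and implement the abstract load method:

from io import BytesIO
from typing import Any, Union

class TextOnlyLoader(DocumentLoader):
    SUPPORTED_FORMATS = ["txt"]

    def load(self, source: Union[str, BytesIO]) -> Any:
        if isinstance(source, str):
            with open(source, "r", encoding="utf-8") as f:
                return f.read()
        # BytesIO stream: decode the raw bytes
        return source.getvalue().decode("utf-8")

With SUPPORTED_FORMATS declared, can_handle works out of the box, since the base class checks file extensions and MIME types against that list.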
Core Features¶
Configuration Support¶
All Document Loaders support configuration-based initialization through dedicated config classes:
from extract_thinker import DocumentLoaderAWSTextract, TextractConfig
# Create configuration
config = TextractConfig(
    aws_access_key_id="your_key",
    feature_types=["TABLES", "FORMS"],
    cache_ttl=600
)
# Initialize with configuration
loader = DocumentLoaderAWSTextract(config)
Caching¶
All Document Loaders include built-in caching capabilities through the CachedDocumentLoader base class. This provides automatic caching of document processing results with a configurable TTL:
Cached Document Loader
The CachedDocumentLoader extends the base loader with caching capabilities:
from io import BytesIO
from typing import Any, Union
from cachetools import TTLCache
from extract_thinker.document_loader.document_loader import DocumentLoader


class CachedDocumentLoader(DocumentLoader):
    def __init__(self, content: Any = None, cache_ttl: int = 300):
        super().__init__(content)
        self.cache = TTLCache(maxsize=100, ttl=cache_ttl)

    def load(self, source: Union[str, BytesIO]) -> Any:
        """
        Load content from source with caching support.

        Args:
            source: Either a file path (str) or a BytesIO stream

        Returns:
            The loaded content
        """
        # Use the source and vision_mode state as the cache key
        if isinstance(source, str):
            cache_key = (source, self.vision_mode)
        else:
            # For BytesIO, use the content and vision_mode state as the cache key
            cache_key = (source.getvalue(), self.vision_mode)

        if cache_key in self.cache:
            return self.cache[cache_key]

        result = super().load(source)
        self.cache[cache_key] = result
        return result
Example usage of caching:
from typing import Any
from extract_thinker.document_loader import DocumentLoader

class MyCustomLoader(DocumentLoader):
    def __init__(self, content: Any = None, cache_ttl: int = 300):
        super().__init__(content, cache_ttl)  # 300 seconds default TTL
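With the TTL cache in place, repeating a load of the same source within the TTL window returns the cached result instead of re-processing the document. A sketch, assuming MyCustomLoader inherits the caching load (e.g. via CachedDocumentLoader) and implements the abstract load:

loader = MyCustomLoader(cache_ttl=600)  # cache results for 10 minutes

first = loader.load("invoice.pdf")   # processed and cached
second = loader.load("invoice.pdf")  # served from the cache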
File Type Support¶
Document Loaders automatically validate file types through the can_handle method:
loader = MyCustomLoader()
if loader.can_handle("document.pdf"):
    content = loader.load("document.pdf")
Multiple Input Types¶
Loaders support both file paths and BytesIO streams:
# Load from file
content = loader.load("document.pdf")
# Load from stream
with open("document.pdf", "rb") as f:
    stream = BytesIO(f.read())
    content = loader.load(stream)
Vision Mode Support¶
Many loaders support vision mode for handling images and visual content:
# Enable vision mode
loader.set_vision_mode(True)
# Load document with images
pages = loader.load("document.pdf")
for page in pages:
    text = page["content"]
    image = page.get("image")  # Available in vision mode
Image Resizing¶
Loaders can cap the size of images produced during conversion with set_max_image_size. When a converted image's width or height exceeds the limit, it is scaled down proportionally (using Lanczos resampling) before being returned; by default no resizing is applied.
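A short sketch, reusing the MyCustomLoader skeleton from above:

loader = MyCustomLoader()
loader.set_max_image_size(2048)  # cap the longest side at 2048 pixels

# Pages larger than 2048px on either side are scaled down,
# preserving the aspect ratio
images = loader.convert_to_images("document.pdf")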
Image Conversion¶
The base loader includes utilities for converting documents to images:
loader = MyCustomLoader()  # DocumentLoader itself is abstract; use a concrete loader
images = loader.convert_to_images(
    "document.pdf",
    scale=300/72  # DPI scaling
)
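The result is a dictionary mapping page index to encoded image bytes (JPEG for local files and PDF pages; URL sources instead map page 0 to a list of PNG chunks), so individual pages can be persisted or forwarded, for example:

for page_index, image_bytes in images.items():
    with open(f"page_{page_index}.jpg", "wb") as f:
        f.write(image_bytes)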
Common Methods¶
All Document Loaders implement these core methods:
- load(source): Main entry point for loading documents
- set_vision_mode(enabled): Enable/disable vision mode
- set_max_image_size(size): Set the maximum image size
Best Practices¶
- Use configuration classes for complex initialization
- Set appropriate cache TTL based on your use case
- Check file type support before processing
- Consider memory usage when processing large files
- Enable vision mode only when needed
- Handle both file paths and streams for flexibility
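A sketch tying several of these practices together, again using the hypothetical MyCustomLoader from above:

loader = MyCustomLoader(cache_ttl=600)

source = "contract.pdf"
if not loader.can_handle(source):
    raise ValueError(f"Unsupported file type: {source}")

# Enable vision mode only when visual content matters
if loader.can_handle_vision(source):
    loader.set_vision_mode(True)
    loader.set_max_image_size(2048)  # keep memory usage bounded

content = loader.load(source)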
Available Loaders¶
ExtractThinker provides several specialized Document Loaders:
Cloud Services¶
- AWS Textract: AWS document processing with support for text, tables, forms, and layout analysis
- Azure Form: Azure's Document Intelligence with multiple model support
- Google Document AI: Google's document understanding with native PDF parsing
Local Processing¶
- PDF Plumber: Advanced PDF text and table extraction
- PyPDF: Basic PDF processing with password protection support
- Tesseract: Open-source OCR with multiple language support
- Doc2txt: Microsoft Word document processing
- Spreadsheet: Excel and CSV handling
- Text File: Plain text file handling with encoding support
- Markitdown: Multi-format document processing
- Docling: Advanced document layout and table analysis
Special Purpose¶
- Web Loader: Web page extraction with custom element handling
- LLM Image: Vision-enabled LLM processing
- Data: Pre-processed data handling with standardized format support
Coming Soon¶
- Adobe PDF Services: Adobe's PDF extraction and analysis
- ABBYY FineReader: Enterprise-grade OCR solution
- PaddleOCR: High-performance multilingual OCR
- Unstructured: Open-source document preprocessing
- Mathpix: Math and scientific document processing
- EasyOCR: Ready-to-use OCR with multilingual support
- Nanonets: API-based document processing
- Mindee: Specialized document parsing APIs
- Rossum: AI-powered document understanding
- Kofax: Intelligent document processing