← GDG /

#216 gdtest_long_names

#216 gdtest_long_names OK CONFIG
Long object names for sidebar wrapping tests
Long object names to test sidebar smart line-breaking. Classes like DuckDBDocumentStore, PostgreSQLDocumentStore have methods such as retrieve_by_similarity() and retrieve_hybrid_combination(). Also includes plain-text names: all-lowercase, all-uppercase, and initial-cap (e.g. documentstorewithvectorsearchcapabilities). Section titles include 'DuckDBDocumentStore Methods'. Key test: sidebar items wrap at dots, underscores, and camelCase boundaries instead of being truncated with ellipsis.
View Site → Build Log 🧪 Test Coverage

Build Mode

● Has great-docs.yml

This package ships a pre-supplied config. The great-docs init step is skipped and great-docs build uses the spec-defined configuration directly. Tests specific config options and their rendered output.

Dimensions

A1 B1 C3 D1 E6 F6 G1 H8
A1 · Flat layout · layout
B1 · Explicit __all__ · exports
C3 · Big class (>5) · objects
D1 · NumPy · docstrings
E6 · No directives · directives
F6 · No user guide · user_guide
G1 · README.md · landing
H8 · H8

Source Files

๐Ÿ“ gdtest_long_names/
๐Ÿ“„ __init__.py
"""Package with deliberately long object names."""

__version__ = "0.1.0"

from gdtest_long_names.store import (
    BaseDocumentStore,
    DuckDBDocumentStore,
    PostgreSQLDocumentStore,
)
from gdtest_long_names.embedding import (
    EmbeddingProvider,
    OpenAIEmbeddingProvider,
    CohereEmbeddingProvider,
)
from gdtest_long_names.chunker import (
    BaseChunkerStrategy,
    MarkdownChunkerStrategy,
)
from gdtest_long_names.types import (
    RetrievedDocumentChunk,
    DocumentMetadataConfig,
    EmbeddingVectorResult,
)
from gdtest_long_names.plaintext import (
    documentstorewithvectorsearchcapabilities,
    EMBEDDINGPROVIDERWITHBATCHPROCESSINGSUPPORT,
    Chunkerstrategywithoverlapdetection,
)

# Public API of the package: every name re-exported by the imports above,
# grouped by the submodule it is imported from.
__all__ = [
    # gdtest_long_names.store
    "BaseDocumentStore",
    "DuckDBDocumentStore",
    "PostgreSQLDocumentStore",
    # gdtest_long_names.embedding
    "EmbeddingProvider",
    "OpenAIEmbeddingProvider",
    "CohereEmbeddingProvider",
    # gdtest_long_names.chunker
    "BaseChunkerStrategy",
    "MarkdownChunkerStrategy",
    # gdtest_long_names.types
    "RetrievedDocumentChunk",
    "DocumentMetadataConfig",
    "EmbeddingVectorResult",
    # gdtest_long_names.plaintext (names without separators or camelCase)
    "documentstorewithvectorsearchcapabilities",
    "EMBEDDINGPROVIDERWITHBATCHPROCESSINGSUPPORT",
    "Chunkerstrategywithoverlapdetection",
]
📄 chunker.py
"""Chunker strategy implementations."""


class BaseChunkerStrategy:
    """
    Base class for strategies that split documents into chunks.

    The methods defined here are placeholders that return empty lists.

    Parameters
    ----------
    max_chunk_size
        Upper bound, in characters, on the size of a single chunk.
    overlap_size
        How many characters neighbouring chunks share.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        overlap_size: int = 200,
    ):
        # Store the configuration unchanged; no validation is performed.
        self.max_chunk_size, self.overlap_size = max_chunk_size, overlap_size

    def chunk_document_content(self, content: str) -> list:
        """Break document content up into chunks (placeholder: returns an empty list)."""
        chunks: list = []
        return chunks

    def calculate_optimal_boundaries(self, content: str) -> list:
        """Locate the best chunk boundary positions (placeholder: returns an empty list)."""
        boundaries: list = []
        return boundaries


class MarkdownChunkerStrategy(BaseChunkerStrategy):
    """
    Chunking strategy for Markdown that honours heading boundaries.

    The methods defined here are placeholders that return empty lists.

    Parameters
    ----------
    max_chunk_size
        Upper bound, in characters, on the size of a single chunk.
    overlap_size
        How many characters neighbouring chunks share.
    preserve_code_blocks
        Whether code blocks should be kept in one piece.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        overlap_size: int = 200,
        preserve_code_blocks: bool = True,
    ):
        # The base class stores the two size settings.
        super().__init__(max_chunk_size=max_chunk_size, overlap_size=overlap_size)
        self.preserve_code_blocks = preserve_code_blocks

    def split_by_heading_hierarchy(self, content: str) -> list:
        """Divide content along its Markdown heading hierarchy (placeholder: returns an empty list)."""
        sections: list = []
        return sections

    def merge_undersized_fragments(self, chunks: list) -> list:
        """Combine chunks too small to stand alone (placeholder: returns an empty list)."""
        merged: list = []
        return merged
📄 embedding.py
"""Embedding provider implementations."""


class EmbeddingProvider:
    """
    Common base for all embedding providers.

    Parameters
    ----------
    model_name
        Which embedding model to use, by name.
    """

    def __init__(self, model_name: str):
        # Only the model name is recorded; nothing is loaded or contacted.
        self.model_name = model_name

    def generate_embeddings(self, texts: list) -> list:
        """Produce embeddings for the given texts (placeholder: returns an empty list)."""
        embeddings: list = []
        return embeddings


class OpenAIEmbeddingProvider(EmbeddingProvider):
    """
    Embedding provider backed by OpenAI text-embedding models.

    The methods defined here are placeholders that return fixed empty values.

    Parameters
    ----------
    model_name
        Which OpenAI model to use, by name.
    api_key
        Key used for the OpenAI API.
    """

    def __init__(
        self,
        model_name: str = "text-embedding-3-small",
        api_key: str = "",
    ):
        # The base class records the model name.
        super().__init__(model_name=model_name)
        self.api_key = api_key

    def generate_embeddings_batch(self, texts: list, batch_size: int = 100) -> list:
        """Produce embeddings batch by batch to cope with rate limits (placeholder: returns an empty list)."""
        embeddings: list = []
        return embeddings

    def calculate_token_usage(self, texts: list) -> int:
        """Work out the total token usage for the given texts (placeholder: returns 0)."""
        total_tokens = 0
        return total_tokens


class CohereEmbeddingProvider(EmbeddingProvider):
    """
    Embedding provider for Cohere models, with input-type support.

    The methods defined here are placeholders that return empty lists.

    Parameters
    ----------
    model_name
        Which Cohere model to use, by name.
    input_type
        Kind of input being embedded.
    """

    def __init__(
        self,
        model_name: str = "embed-english-v3.0",
        input_type: str = "search_document",
    ):
        # The base class records the model name.
        super().__init__(model_name=model_name)
        self.input_type = input_type

    def generate_with_input_type(self, texts: list, input_type: str) -> list:
        """Produce embeddings for a specific input type (placeholder: returns an empty list)."""
        embeddings: list = []
        return embeddings

    def get_supported_languages(self) -> list:
        """List the supported languages (placeholder: returns an empty list)."""
        languages: list = []
        return languages
📄 plaintext.py
"""Classes with long plain-text names (no special characters)."""


class documentstorewithvectorsearchcapabilities:
    """
    Document store that supports vector search.

    The class name is deliberately all lowercase, with no separators,
    underscores, dots, or camelCase transitions. Every method is a
    placeholder that returns a fixed empty value.

    Parameters
    ----------
    connectionstring
        Connection string for the database.
    vectordimension
        Number of dimensions of the stored vectors.
    """

    def __init__(
        self,
        connectionstring: str,
        vectordimension: int = 1536,
    ):
        # Store the settings as given; no connection is opened.
        self.connectionstring, self.vectordimension = connectionstring, vectordimension

    def insertdocumentswithembeddings(self, docs: list) -> int:
        """Add documents together with their embedding vectors (placeholder: returns 0)."""
        inserted = 0
        return inserted

    def searchbyvectorsimilarity(self, query: str, topk: int = 10) -> list:
        """Look documents up by vector similarity (placeholder: returns an empty list)."""
        matches: list = []
        return matches

    def rebuildvectorsearchindex(self) -> None:
        """Recreate the internal vector search index (placeholder: does nothing)."""
        return None

    def deletedocumentsbyidentifier(self, docid: str) -> bool:
        """Remove the document with the given unique identifier (placeholder: returns False)."""
        deleted = False
        return deleted

    def countdocumentsincollection(self) -> int:
        """Report how many documents are stored (placeholder: returns 0)."""
        total = 0
        return total

    def exportcollectiontojsonlines(self, filepath: str) -> int:
        """Write all documents out to a JSON Lines file (placeholder: returns 0)."""
        exported = 0
        return exported


class EMBEDDINGPROVIDERWITHBATCHPROCESSINGSUPPORT:
    """
    Embedding provider class with an all-uppercase name.

    The class name is deliberately all uppercase, with no separators,
    underscores, dots, or camelCase transitions. Every method is a
    placeholder that returns a fixed value.

    Parameters
    ----------
    MODELIDENTIFIER
        Identifier of the embedding model.
    BATCHLIMIT
        Largest batch size used for processing.
    """

    def __init__(
        self,
        MODELIDENTIFIER: str,
        BATCHLIMIT: int = 100,
    ):
        # Store the settings as given; no validation is performed.
        self.MODELIDENTIFIER, self.BATCHLIMIT = MODELIDENTIFIER, BATCHLIMIT

    def GENERATEEMBEDDINGSFROMTEXTINPUT(self, texts: list) -> list:
        """Produce embeddings from a list of text inputs (placeholder: returns an empty list)."""
        embeddings: list = []
        return embeddings

    def CALCULATETOKENCOUNTFORTEXTS(self, texts: list) -> int:
        """Work out the total token count of the given texts (placeholder: returns 0)."""
        token_count = 0
        return token_count

    def RETRIEVEMODELCONFIGURATION(self) -> dict:
        """Fetch the current model configuration (placeholder: returns an empty dict)."""
        configuration: dict = {}
        return configuration

    def VALIDATEINPUTTEXTLENGTHS(self, texts: list) -> bool:
        """Check that every input text is within the length limits (placeholder: returns True)."""
        all_valid = True
        return all_valid

    def EXPORTEMBEDDINGSTOFILE(self, filepath: str) -> int:
        """Write computed embeddings out to a file (placeholder: returns 0)."""
        exported = 0
        return exported

    def RESETINTERNALBATCHCOUNTER(self) -> None:
        """Clear the internal batch processing counter (placeholder: does nothing)."""
        return None


class Chunkerstrategywithoverlapdetection:
    """
    Chunker strategy class with an initial-cap name.

    The class name deliberately begins with one uppercase letter and is
    otherwise all lowercase, with no other separators. Every method is a
    placeholder that returns a fixed empty value.

    Parameters
    ----------
    maxchunksize
        Upper bound, in characters, on the size of a single chunk.
    overlapsize
        How many characters neighbouring chunks share.
    """

    def __init__(
        self,
        maxchunksize: int = 1000,
        overlapsize: int = 200,
    ):
        # Store the settings as given; no validation is performed.
        self.maxchunksize, self.overlapsize = maxchunksize, overlapsize

    def splitcontentintochunks(self, content: str) -> list:
        """Break document content up into overlapping chunks (placeholder: returns an empty list)."""
        chunks: list = []
        return chunks

    def detectoverlapboundaries(self, content: str) -> list:
        """Find the best overlap boundary positions (placeholder: returns an empty list)."""
        boundaries: list = []
        return boundaries

    def mergeundersizedfragments(self, chunks: list) -> list:
        """Combine fragments too small to stand alone (placeholder: returns an empty list)."""
        merged: list = []
        return merged

    def calculateoverlappercentage(self, chunks: list) -> float:
        """Work out the average overlap percentage between chunks (placeholder: returns 0.0)."""
        percentage = 0.0
        return percentage

    def exportchunkswithoverlap(self, filepath: str) -> int:
        """Write chunks with overlap markers out to a file (placeholder: returns 0)."""
        exported = 0
        return exported

    def resetinternalchunkcache(self) -> None:
        """Clear the internal chunk processing cache (placeholder: does nothing)."""
        return None
📄 store.py
"""Document store implementations."""


class BaseDocumentStore:
    """
    Base class shared by all document stores.

    The methods defined here are placeholders that do nothing.

    Parameters
    ----------
    connection_string
        Connection string for the database.
    """

    def __init__(self, connection_string: str):
        # Only the connection string is recorded; no connection is opened.
        self.connection_string = connection_string

    def connect_to_database(self) -> None:
        """Open the connection to the underlying database (placeholder: does nothing)."""
        return None

    def create_collection(self, name: str) -> None:
        """Set up a new document collection (placeholder: does nothing)."""
        return None


class DuckDBDocumentStore(BaseDocumentStore):
    """
    Document store backed by DuckDB, with vector search.

    Every method is a placeholder that returns a fixed empty value.

    Parameters
    ----------
    connection_string
        Connection string for the database.
    index_type
        Kind of vector index to use.
    """

    def __init__(
        self,
        connection_string: str,
        index_type: str = "hnsw",
    ):
        # The base class records the connection string.
        super().__init__(connection_string=connection_string)
        self.index_type = index_type

    def upsert_documents(self, docs: list) -> int:
        """Add documents to the store, updating existing ones (placeholder: returns 0)."""
        upserted = 0
        return upserted

    def ingest_from_directory(self, path: str) -> int:
        """Load every document found in a directory (placeholder: returns 0)."""
        ingested = 0
        return ingested

    def retrieve_by_similarity(self, query: str, top_k: int = 10) -> list:
        """Fetch documents via vector similarity search (placeholder: returns an empty list)."""
        matches: list = []
        return matches

    def retrieve_by_bm25_score(self, query: str, top_k: int = 10) -> list:
        """Fetch documents via BM25 text scoring (placeholder: returns an empty list)."""
        matches: list = []
        return matches

    def retrieve_hybrid_combination(self, query: str, top_k: int = 10) -> list:
        """Fetch documents via a hybrid of vector and BM25 scoring (placeholder: returns an empty list)."""
        matches: list = []
        return matches

    def build_vector_index(self) -> None:
        """Create or recreate the vector similarity index (placeholder: does nothing)."""
        return None

    def get_collection_size(self) -> int:
        """Report how many documents the store holds (placeholder: returns 0)."""
        size = 0
        return size


class PostgreSQLDocumentStore(BaseDocumentStore):
    """
    Document store backed by PostgreSQL with pgvector.

    Every method is a placeholder that returns a fixed empty value.

    Parameters
    ----------
    connection_string
        Connection string for the database.
    embedding_dimension
        Number of dimensions of the embedding vectors.
    """

    def __init__(
        self,
        connection_string: str,
        embedding_dimension: int = 1536,
    ):
        # The base class records the connection string.
        super().__init__(connection_string=connection_string)
        self.embedding_dimension = embedding_dimension

    def upsert_with_embeddings(self, docs: list, embeddings: list) -> int:
        """Add or update documents along with precomputed embeddings (placeholder: returns 0)."""
        upserted = 0
        return upserted

    def retrieve_nearest_neighbors(self, embedding: list, top_k: int = 10) -> list:
        """Fetch documents via nearest neighbor search (placeholder: returns an empty list)."""
        neighbors: list = []
        return neighbors

    def create_ivfflat_index(self, num_lists: int = 100) -> None:
        """Set up an IVFFlat index for approximate search (placeholder: does nothing)."""
        return None

    def vacuum_analyze_table(self) -> None:
        """Execute VACUUM ANALYZE on the document table (placeholder: does nothing)."""
        return None
📄 types.py
"""Type definitions and data containers."""

from dataclasses import dataclass, field


@dataclass
class RetrievedDocumentChunk:
    """
    One chunk of a document, as returned by a retrieval query.

    Parameters
    ----------
    content
        Text held by the chunk.
    similarity_score
        Cosine similarity score (0 to 1).
    document_id
        Identifier of the document the chunk came from.
    """

    # All three fields are required and positional, in this order.
    content: str
    similarity_score: float
    document_id: str


@dataclass
class DocumentMetadataConfig:
    """
    Configuration for document metadata extraction.

    Parameters
    ----------
    extract_title
        Whether to extract document titles.
    extract_author
        Whether to extract author information.
    custom_metadata_fields
        Additional metadata fields to extract. Defaults to a new empty list
        for each instance; an explicitly passed ``None`` is also replaced
        by an empty list.
    """

    extract_title: bool = True
    extract_author: bool = True
    # default_factory gives every instance its own list and keeps the
    # ``list`` annotation truthful (a ``None`` default would not match it).
    custom_metadata_fields: list = field(default_factory=list)

    def __post_init__(self):
        """Replace an explicitly passed ``None`` with a new empty list."""
        # Kept so callers that pass custom_metadata_fields=None still get [].
        if self.custom_metadata_fields is None:
            self.custom_metadata_fields = []


@dataclass
class EmbeddingVectorResult:
    """
    Container holding the outcome of an embedding vector operation.

    Parameters
    ----------
    vectors
        The embedding vectors, as a list.
    model_name
        Name of the model that was used.
    token_count
        Total number of tokens processed.
    """

    # All three fields are required and positional, in this order.
    vectors: list
    model_name: str
    token_count: int
📄 great-docs.yml
# API reference layout: each section has a title, a short description, and
# the list of exported object names it contains.
reference:
  sections:
    # Classes imported from gdtest_long_names.store
    - title: Document Stores
      desc: Backend storage systems for documents and embeddings.
      contents:
        - BaseDocumentStore
        - DuckDBDocumentStore
        - PostgreSQLDocumentStore
    # Classes imported from gdtest_long_names.embedding
    - title: Embedding Providers
      desc: Services for generating vector embeddings.
      contents:
        - EmbeddingProvider
        - OpenAIEmbeddingProvider
        - CohereEmbeddingProvider
    # Classes imported from gdtest_long_names.chunker
    - title: Chunker Strategies
      desc: Strategies for splitting documents into chunks.
      contents:
        - BaseChunkerStrategy
        - MarkdownChunkerStrategy
    # Dataclasses imported from gdtest_long_names.types
    - title: Data Types
      desc: Type definitions and result containers.
      contents:
        - RetrievedDocumentChunk
        - DocumentMetadataConfig
        - EmbeddingVectorResult
    # Classes imported from gdtest_long_names.plaintext
    - title: Plain Text Names
      desc: Classes with long names containing no special characters.
      contents:
        - documentstorewithvectorsearchcapabilities
        - EMBEDDINGPROVIDERWITHBATCHPROCESSINGSUPPORT
        - Chunkerstrategywithoverlapdetection
# NOTE(review): sidebar_filter semantics are defined by great-docs, not here;
# presumably min_items is the item count at which the filter appears — confirm.
sidebar_filter:
  enabled: true
  min_items: 1