Files
kte/PieceTable.h
Kyle Isom 8ec0d6ac41 Add benchmarks, migration tests, and dev guide
Add benchmarks for core operations, migration edge case tests, improved
buffer I/O tests, and developer guide

- Introduced `test_benchmarks.cc` for performance benchmarking of key
  operations in `PieceTable` and `Buffer`, including syntax highlighting
  and iteration patterns.
- Added `test_migration_coverage.cc` to provide comprehensive tests for
  migration of `Buffer::Rows()` to `PieceTable` APIs, with edge cases,
  boundary handling, and consistency checks.
- Enhanced `test_buffer_io.cc` with additional cases for save/load
  workflows, file handling, and better integration with the core API.
- Documented architectural details and core concepts in a new
  `DEVELOPER_GUIDE.md`. Highlighted design principles, code
  organization, and contribution workflows.
2026-02-17 16:08:23 -08:00

221 lines
6.7 KiB
C++

/*
* PieceTable.h - Alternative to GapBuffer using a piece table representation
*
* PieceTable is kte's core text storage data structure. It provides efficient
* insert/delete operations without copying the entire buffer by maintaining a
* sequence of "pieces" that reference ranges in two underlying buffers:
* - original_: Initial file content (currently unused, reserved for future use)
* - add_: All text added during editing
*
* Key advantages:
* - O(1) append/prepend operations (common case)
* - O(n) insert/delete at arbitrary positions (n = number of pieces, not bytes)
* - Efficient undo: just restore the piece list
* - Memory efficient: no gap buffer waste
*
* Performance characteristics:
* - Piece count grows with edit operations; automatic consolidation prevents unbounded growth
* - Materialization (Data() call) is O(total_size) but cached until next edit
* - Line index is lazily rebuilt on first line-based query after edits
* - Range and Find operations use lightweight caches for repeated queries
*
* API evolution:
* 1. Legacy API (GapBuffer compatibility):
* - Append/Prepend: Build content sequentially
* - Data(): Materialize entire buffer
*
* 2. New buffer-wide API (Phase 1):
* - Insert/Delete: Edit at arbitrary byte offsets
* - Line-based queries: LineCount, GetLine, GetLineRange
* - Position conversion: ByteOffsetToLineCol, LineColToByteOffset
* - Efficient extraction: GetRange, Find, WriteToStream
*
* Implementation notes:
* - Consolidation heuristics prevent piece fragmentation (configurable via SetConsolidationParams)
* - Thread-safe for concurrent reads (mutex protects caches and lazy rebuilds)
* - Version tracking invalidates caches on mutations
*/
#pragma once
#include <cstddef>
#include <cstdint>
#include <limits>
#include <mutex>
#include <ostream>
#include <string>
#include <utility>
#include <vector>
/*
 * PieceTable - text storage expressed as an ordered list of pieces.
 *
 * Each Piece names a source buffer (original_ or add_) plus a [start, start+len)
 * range inside it; the logical content is the concatenation of all pieces in
 * pieces_ order. A contiguous copy of the content is kept in materialized_ and
 * is produced on demand by materialize().
 *
 * All non-inline members are defined out of line; notes below about their
 * behavior restate the declarations and existing comments only.
 */
class PieceTable {
public:
    PieceTable();

    explicit PieceTable(std::size_t initialCapacity);

    // Advanced constructor allowing configuration of consolidation heuristics
    // (same three knobs as SetConsolidationParams).
    PieceTable(std::size_t initialCapacity,
               std::size_t piece_limit,
               std::size_t small_piece_threshold,
               std::size_t max_consolidation_bytes);

    // Copy/move are user-provided: the class holds a std::mutex, which is
    // neither copyable nor movable, so the defaults would be unavailable.
    PieceTable(const PieceTable &other);

    PieceTable &operator=(const PieceTable &other);

    PieceTable(PieceTable &&other) noexcept;

    PieceTable &operator=(PieceTable &&other) noexcept;

    ~PieceTable();

    // ===== Legacy API (mirrors GapBuffer) =====

    void Reserve(std::size_t newCapacity);

    void AppendChar(char c);

    // Append `len` bytes starting at `s`.
    void Append(const char *s, std::size_t len);

    void Append(const PieceTable &other);

    void PrependChar(char c);

    // Prepend `len` bytes starting at `s`.
    void Prepend(const char *s, std::size_t len);

    void Prepend(const PieceTable &other);

    // Content management
    void Clear();

    // ===== Accessors =====

    // Materializes the content and returns a pointer to the contiguous copy,
    // or nullptr when that copy is empty.
    // NOTE(review): the pointer refers to the internal materialized_ string;
    // presumably it is invalidated by the next mutation and writes through it
    // are not reflected in the piece list - confirm against the implementation.
    char *Data()
    {
        materialize();
        return materialized_.empty() ? nullptr : materialized_.data();
    }

    // Const overload of Data(); same nullptr-on-empty contract.
    [[nodiscard]] const char *Data() const
    {
        // materialize() is declared const, so it can be invoked directly on a
        // const object; no const_cast is required.
        materialize();
        return materialized_.empty() ? nullptr : materialized_.data();
    }

    // Total content size in bytes, as tracked in total_size_.
    [[nodiscard]] std::size_t Size() const
    {
        return total_size_;
    }

    [[nodiscard]] std::size_t Capacity() const
    {
        // Capacity for piece table isn't directly meaningful; report materialized capacity
        return materialized_.capacity();
    }

    // ===== New buffer-wide API (Phase 1) =====

    // Byte-based editing operations
    void Insert(std::size_t byte_offset, const char *text, std::size_t len);

    void Delete(std::size_t byte_offset, std::size_t len);

    // Line-based queries
    [[nodiscard]] std::size_t LineCount() const; // number of logical lines

    [[nodiscard]] std::string GetLine(std::size_t line_num) const;

    [[nodiscard]] std::pair<std::size_t, std::size_t> GetLineRange(std::size_t line_num) const; // [start,end)

    // Position conversion
    [[nodiscard]] std::pair<std::size_t, std::size_t> ByteOffsetToLineCol(std::size_t byte_offset) const;

    [[nodiscard]] std::size_t LineColToByteOffset(std::size_t row, std::size_t col) const;

    // Substring extraction
    [[nodiscard]] std::string GetRange(std::size_t byte_offset, std::size_t len) const;

    // Simple search utility; returns byte offset or npos
    // NOTE(review): "npos" is presumably std::numeric_limits<std::size_t>::max(),
    // matching the default of FindCache::result - confirm in the implementation.
    [[nodiscard]] std::size_t Find(const std::string &needle, std::size_t start = 0) const;

    // Stream out content without materializing the entire buffer
    void WriteToStream(std::ostream &out) const;

    // Heuristic configuration
    void SetConsolidationParams(std::size_t piece_limit,
                                std::size_t small_piece_threshold,
                                std::size_t max_consolidation_bytes);

private:
    // Which backing buffer a piece refers to.
    enum class Source : unsigned char { Original, Add };

    // A contiguous run of bytes inside one of the backing buffers.
    struct Piece {
        Source src;        // backing buffer selector
        std::size_t start; // offset of the run within that buffer
        std::size_t len;   // length of the run in bytes
    };

    void addPieceBack(Source src, std::size_t start, std::size_t len);

    void addPieceFront(Source src, std::size_t start, std::size_t len);

    // Const because it only needs to refresh the mutable materialized cache.
    void materialize() const;

    // Helper: locate piece index and inner offset for a global byte offset
    [[nodiscard]] std::pair<std::size_t, std::size_t> locate(std::size_t byte_offset) const;

    // Helper: try to coalesce neighboring pieces around index
    void coalesceNeighbors(std::size_t index);

    // Consolidation helpers and heuristics
    void maybeConsolidate();

    void consolidateRange(std::size_t start_idx, std::size_t end_idx);

    void appendPieceDataTo(std::string &out, const Piece &p) const;

    // Line index support (rebuilt lazily on demand)
    void InvalidateLineIndex() const;

    void RebuildLineIndex() const;

    // Underlying storages
    std::string original_; // unused for builder use-case, but kept for API symmetry
    std::string add_;
    std::vector<Piece> pieces_;

    // Contiguous copy of the content handed out by Data(), with its dirty flag.
    mutable std::string materialized_;
    mutable bool dirty_ = true;

    // Monotonic content version. Increment on any mutation that affects content layout
    mutable std::uint64_t version_ = 0;

    std::size_t total_size_ = 0;

    // Cached line index: starting byte offset of each line (always contains at least 1 entry: 0)
    mutable std::vector<std::size_t> line_index_;
    mutable bool line_index_dirty_ = true;

    // Heuristic knobs
    std::size_t piece_limit_ = 4096;             // trigger consolidation when exceeded
    std::size_t small_piece_threshold_ = 64;     // bytes
    std::size_t max_consolidation_bytes_ = 4096; // cap per consolidation run

    // Lightweight caches to avoid redundant work when callers query the same range repeatedly.
    // Each cache records a version alongside its payload.
    // NOTE(review): presumably compared with version_ to detect staleness - confirm.
    struct RangeCache {
        bool valid = false;
        std::uint64_t version = 0;
        std::size_t off = 0;
        std::size_t len = 0;
        std::string data;
    };

    struct FindCache {
        bool valid = false;
        std::uint64_t version = 0;
        std::string needle;
        std::size_t start = 0;
        std::size_t result = std::numeric_limits<std::size_t>::max();
    };

    mutable RangeCache range_cache_;
    mutable FindCache find_cache_;

    // NOTE(review): per the file header this guards the caches and lazy rebuilds;
    // the locking itself lives in the out-of-line definitions.
    mutable std::mutex mutex_;
};