Transaction-based document editing for LLM workflows
LLM-generated document edits are unreliable. An edit might fail validation, place content in the wrong section, or include duplicate headers. When you’re applying five edits and the third one fails, what happens to the first two?
This pattern provides transaction semantics for document editing—atomic commit/rollback, pre-commit verification, and automatic cleanup of LLM artifacts.
The problem
Direct mutation leaves documents in invalid states:
# UNSAFE: What if edit three fails?
document_model = DocumentModel.from_dict(state["document_model"])
for edit in planned_edits:
    section = document_model.get_section(edit["target"])
    section.blocks.append(new_block)  # Direct mutation
# Now document_model is partially edited
# Cannot recover original state

Additional issues with LLM-generated edits:
- LLMs generate headers even when told not to (duplicate headers)
- Content placed in wrong sections (body text in References)
- Nested block references change during editing
- No verification before changes become permanent
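The obvious workaround is a manual snapshot and restore. A minimal sketch, assuming a hypothetical apply_edit helper standing in for whatever mutates the model:

import copy

# Naive alternative: snapshot by hand, restore on failure.
snapshot = copy.deepcopy(document_model.to_dict())
try:
    for edit in planned_edits:
        apply_edit(document_model, edit)  # hypothetical helper
except Exception:
    # Rebinding only fixes this reference; other holders of the model
    # still see the partially mutated object.
    document_model = DocumentModel.from_dict(snapshot)
    raise

Every caller has to remember the snapshot, the restore only rebinds the local name, and there is still no verification step before the edits become permanent. The transaction wrapper below addresses all three.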
Transaction context manager
The solution uses deep copy to isolate edits until commit:
from contextlib import contextmanager
import copy

class DocumentTransaction:
    """Transaction wrapper for safe document edits."""

    def __init__(self, original_model):
        self._original_model = original_model
        # Create isolated working copy
        self._working_copy = DocumentModel.from_dict(
            copy.deepcopy(original_model.to_dict())
        )
        self._committed = False
        self._rolled_back = False

    def insert_block_at_end(self, section_id: str, block) -> bool:
        """Append block to section (on working copy)."""
        if self._committed or self._rolled_back:
            return False
        section = self._working_copy.get_section(section_id)
        if section:
            section.blocks.append(block)
            return True
        return False
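    # Sketch (not shown in the original listing): the companion mutator used
    # in the usage examples below. Assumes sections expose `section_id` and
    # `subsections`.
    def insert_section_after(self, target_section_id: str, new_section) -> bool:
        """Insert new_section as a sibling after the target (on working copy)."""
        if self._committed or self._rolled_back:
            return False

        def _insert(siblings) -> bool:
            for i, section in enumerate(siblings):
                if section.section_id == target_section_id:
                    siblings.insert(i + 1, new_section)
                    return True
                if _insert(section.subsections):
                    return True
            return False

        return _insert(self._working_copy.sections)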
    def verify(self) -> dict:
        """Validate document integrity before commit."""
        issues = []
        # Check for content in References section
        for section in self._working_copy.sections:
            if section.heading.lower() in ("references", "bibliography"):
                if any(b.content.strip() for b in section.blocks):
                    issues.append(f"Content in {section.heading}")
        # Check for empty sections
        for section in self._working_copy.get_leaf_sections():
            if not section.blocks:
                issues.append(f"Empty section: {section.heading}")
        return {"valid": len(issues) == 0, "issues": issues}
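    # Example result when a reference section was polluted with body text
    # (illustrative, based on the checks above):
    #   {"valid": False, "issues": ["Content in References"]}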
    def commit(self):
        """Apply all changes to original model."""
        self._original_model.sections = self._working_copy.sections
        self._original_model.preamble_blocks = self._working_copy.preamble_blocks
        self._committed = True
        return self._original_model

    def rollback(self):
        """Discard all changes."""
        self._working_copy = None
        self._rolled_back = True

Wire it as a context manager:
class DocumentModel:
    @contextmanager
    def transaction(self):
        """Create transaction for safe edits."""
        txn = DocumentTransaction(self)
        try:
            yield txn
            if not txn._rolled_back:
                txn.commit()
        except Exception:
            txn.rollback()
            raise

Usage:
with document_model.transaction() as txn:
    txn.insert_block_at_end("sec_results", new_block)
    txn.insert_section_after("sec_methodology", synthesis_section)
    verification = txn.verify()
    if not verification["valid"]:
        txn.rollback()
        # Original unchanged

# Auto-commits on successful exit

Duplicate header stripping
LLMs often regenerate section headers even when instructed not to:
Prompt: "Write content for the Results section. Don't include a header."
LLM output: "## Results\n\nThe analysis shows..."
Strip duplicate headers during document reconstruction:
import re

def normalize_heading(text: str) -> str:
    """Normalize heading for comparison."""
    text = text.lower()
    # Strip "1.", "Chapter 1:", "Section 2.3"
    text = re.sub(r'^(?:chapter|section)?\s*[\d.]+[.:)]*\s*', '', text)
    return re.sub(r'[^a-z]', '', text)
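# For reference, these all normalize to "results":
#   normalize_heading("Results")
#   normalize_heading("2.3 Results")
#   normalize_heading("Section 2: Results")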
def strip_leading_header(content: str, section_heading: str | None = None) -> str:
    """Strip leading markdown header if it matches section heading."""
    header_match = re.match(r'^(#{1,6})\s+(.+?)(?:\n|$)', content.strip())
    if not header_match:
        return content
    header_text = header_match.group(2).strip()
    # Only strip if it matches the section heading
    if section_heading and normalize_heading(header_text) == normalize_heading(section_heading):
        return content.strip()[header_match.end():].lstrip('\n')
    return content

Integrate into document rendering:
def to_markdown(self) -> str:
    """Reconstruct markdown with header cleanup."""
    lines = []
    for section in self.sections:
        lines.append(f"{'#' * section.level} {section.heading}")
        lines.append("")
        for block in section.blocks:
            # Strip duplicate header from LLM content
            content = strip_leading_header(block.content, section.heading)
            if content.strip():
                lines.append(content)
                lines.append("")
    return "\n".join(lines).strip()

Hierarchical anchoring
Document edits need stable references to nested elements. Index-based paths break when structure changes:
# Index-based (fragile)
path = "/sections[1]/blocks[0]"
# After inserting section at index 0, this points to wrong content

# ID-based (stable)
anchor = "sec_results/blk_content_123"
# Still points to correct element after structural changes

Implement hierarchical anchors:
class DocumentModel:
    def get_anchor(self, element_id: str) -> str | None:
        """Get hierarchical path for any section or block.

        Returns paths like:
        - "sec_abc123"               # Top-level section
        - "sec_abc123/sec_def456"    # Nested section
        - "sec_abc123/blk_xyz789"    # Block in section
        - "__preamble__/blk_xyz789"  # Preamble block
        """
        for block in self.preamble_blocks:
            if block.block_id == element_id:
                return f"__preamble__/{element_id}"
        for section in self.sections:
            anchor = self._get_anchor_recursive(section, element_id)
            if anchor:
                return anchor
        return None
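    # Sketch of the recursive helper used above (not shown in the original
    # listing); assumes sections expose `section_id`, `subsections`, and `blocks`.
    def _get_anchor_recursive(self, section, element_id: str) -> str | None:
        if section.section_id == element_id:
            return section.section_id
        for block in section.blocks:
            if block.block_id == element_id:
                return f"{section.section_id}/{element_id}"
        for sub in section.subsections:
            anchor = self._get_anchor_recursive(sub, element_id)
            if anchor:
                return f"{section.section_id}/{anchor}"
        return None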
    def resolve_anchor(self, anchor: str):
        """Resolve anchor path to element."""
        parts = anchor.split("/")
        if parts[0] == "__preamble__":
            return self.get_block(parts[1]) if len(parts) == 2 else None
        current = self.get_section(parts[0])
        for part in parts[1:]:
            if part.startswith("blk_"):
                for block in current.blocks:
                    if block.block_id == part:
                        return block
                return None
            for sub in current.subsections:
                if sub.section_id == part:
                    current = sub
                    break
        return current

Complete example
# Load document
document_model = DocumentModel.from_dict(state["document_model"])

# Transaction with multiple edits
with document_model.transaction() as txn:
    # Add synthesis section after methodology
    synthesis = Section.from_heading("Synthesis", level=2)
    synthesis.blocks.append(ContentBlock.from_content(
        "This section synthesizes findings...",
        "paragraph"
    ))
    txn.insert_section_after("sec_methodology", synthesis)

    # Add content to existing section
    additional = ContentBlock.from_content(
        "Recent studies confirm these findings...",
        "paragraph"
    )
    txn.insert_block_at_end("sec_discussion", additional)

    # Verify before commit
    verification = txn.verify()
    if not verification["valid"]:
        logger.warning(f"Issues: {verification['issues']}")
        txn.rollback()

# Auto-commits on exit

# Get stable reference for later use
anchor = document_model.get_anchor("blk_content_xyz789")
# "sec_results/blk_content_xyz789"
# Render with LLM artifact cleanup
markdown = document_model.to_markdown()

When to use this pattern
Use when:
- Applying multiple edits that should succeed or fail together
- Verifying document validity before changes become permanent
- Working with LLM-generated content that may include artifacts
- Keeping stable references to nested content
- Isolating parallel edit operations from one another
Don’t use when:
- Single, simple edit that can’t fail
- Performance-critical path where deep copy is too expensive
- Document model is immutable by design
Trade-offs
Benefits:
- Atomic operations—all edits succeed or fail together
- Safe rollback—original preserved until explicit commit
- Pre-commit verification—detect invalid states early
- LLM artifact cleanup—automatic duplicate header removal
- Stable references—hierarchical anchors survive structure changes
Costs:
- Memory overhead—deep copy doubles memory during transaction
- Performance cost—copy and index rebuild add latency
- Complexity—more code than direct mutation