style: Format all Python files with ruff

- Formatted 103 files to comply with ruff format requirements
- No code logic changes; formatting only (quote style, trailing commas, line wrapping, whitespace)
- Fixes CI formatting check failures
This commit is contained in:
yusyus
2026-02-08 14:42:27 +03:00
parent 6e4f623b9d
commit 0265de5816
103 changed files with 2241 additions and 2627 deletions

View File

@@ -75,10 +75,7 @@ class RAGChunker:
return len(text) // self.chars_per_token
def chunk_document(
self,
text: str,
metadata: dict,
source_file: str | None = None
self, text: str, metadata: dict, source_file: str | None = None
) -> list[dict]:
"""
Chunk single document into RAG-ready chunks.
@@ -125,11 +122,13 @@ class RAGChunker:
if source_file:
chunk_metadata["source_file"] = source_file
result.append({
"chunk_id": f"{metadata.get('source', 'unknown')}_{i}",
"page_content": chunk_text.strip(),
"metadata": chunk_metadata
})
result.append(
{
"chunk_id": f"{metadata.get('source', 'unknown')}_{i}",
"page_content": chunk_text.strip(),
"metadata": chunk_metadata,
}
)
logger.info(
f"Created {len(result)} chunks from {source_file or 'document'} "
@@ -153,14 +152,10 @@ class RAGChunker:
# Chunk main SKILL.md
skill_md = skill_dir / "SKILL.md"
if skill_md.exists():
with open(skill_md, encoding='utf-8') as f:
with open(skill_md, encoding="utf-8") as f:
content = f.read()
metadata = {
"source": skill_dir.name,
"category": "overview",
"file_type": "skill_md"
}
metadata = {"source": skill_dir.name, "category": "overview", "file_type": "skill_md"}
chunks = self.chunk_document(content, metadata, source_file="SKILL.md")
all_chunks.extend(chunks)
@@ -169,26 +164,21 @@ class RAGChunker:
references_dir = skill_dir / "references"
if references_dir.exists():
for ref_file in references_dir.glob("*.md"):
with open(ref_file, encoding='utf-8') as f:
with open(ref_file, encoding="utf-8") as f:
content = f.read()
metadata = {
"source": skill_dir.name,
"category": ref_file.stem,
"file_type": "reference"
"file_type": "reference",
}
chunks = self.chunk_document(
content,
metadata,
source_file=str(ref_file.relative_to(skill_dir))
content, metadata, source_file=str(ref_file.relative_to(skill_dir))
)
all_chunks.extend(chunks)
logger.info(
f"Chunked skill directory {skill_dir.name}: "
f"{len(all_chunks)} total chunks"
)
logger.info(f"Chunked skill directory {skill_dir.name}: {len(all_chunks)} total chunks")
return all_chunks
@@ -207,32 +197,25 @@ class RAGChunker:
# Match code blocks (``` fenced blocks)
# Use DOTALL flag to match across newlines
code_block_pattern = r'```[^\n]*\n.*?```'
code_block_pattern = r"```[^\n]*\n.*?```"
def replacer(match):
idx = len(code_blocks)
code_blocks.append({
"index": idx,
"content": match.group(0),
"start": match.start(),
"end": match.end()
})
code_blocks.append(
{
"index": idx,
"content": match.group(0),
"start": match.start(),
"end": match.end(),
}
)
return placeholder_pattern.format(idx=idx)
text_with_placeholders = re.sub(
code_block_pattern,
replacer,
text,
flags=re.DOTALL
)
text_with_placeholders = re.sub(code_block_pattern, replacer, text, flags=re.DOTALL)
return text_with_placeholders, code_blocks
def _reinsert_code_blocks(
self,
chunks: list[str],
code_blocks: list[dict]
) -> list[str]:
def _reinsert_code_blocks(self, chunks: list[str], code_blocks: list[dict]) -> list[str]:
"""
Re-insert code blocks into chunks.
@@ -249,7 +232,7 @@ class RAGChunker:
for block in code_blocks:
placeholder = f"<<CODE_BLOCK_{block['index']}>>"
if placeholder in chunk:
chunk = chunk.replace(placeholder, block['content'])
chunk = chunk.replace(placeholder, block["content"])
result.append(chunk)
return result
@@ -268,15 +251,15 @@ class RAGChunker:
# Paragraph boundaries (double newline)
if self.preserve_paragraphs:
for match in re.finditer(r'\n\n+', text):
for match in re.finditer(r"\n\n+", text):
boundaries.append(match.end())
# Section headers (# Header)
for match in re.finditer(r'\n#{1,6}\s+.+\n', text):
for match in re.finditer(r"\n#{1,6}\s+.+\n", text):
boundaries.append(match.start())
# Single newlines (less preferred, but useful)
for match in re.finditer(r'\n', text):
for match in re.finditer(r"\n", text):
boundaries.append(match.start())
# Add artificial boundaries for large documents
@@ -352,7 +335,9 @@ class RAGChunker:
# Add chunk if it meets minimum size requirement
# (unless the entire text is smaller than target size)
if chunk_text.strip() and (len(text) <= target_size_chars or len(chunk_text) >= min_size_chars):
if chunk_text.strip() and (
len(text) <= target_size_chars or len(chunk_text) >= min_size_chars
):
chunks.append(chunk_text)
# Move to next chunk with overlap
@@ -383,7 +368,7 @@ class RAGChunker:
"""
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
with open(output_path, "w", encoding="utf-8") as f:
json.dump(chunks, f, indent=2, ensure_ascii=False)
logger.info(f"Saved {len(chunks)} chunks to {output_path}")
@@ -393,7 +378,9 @@ def main():
"""CLI entry point for testing RAG chunker."""
import argparse
parser = argparse.ArgumentParser(description="RAG Chunker - Semantic chunking for RAG pipelines")
parser = argparse.ArgumentParser(
description="RAG Chunker - Semantic chunking for RAG pipelines"
)
parser.add_argument("skill_dir", type=Path, help="Path to skill directory")
parser.add_argument("--output", "-o", type=Path, help="Output JSON file")
parser.add_argument("--chunk-size", type=int, default=512, help="Target chunk size in tokens")