bible-epub-processing
SKILL.md
Bible EPUB Processing
Overview
Parse and extract structured content from New World Translation (NWT) Bible EPUBs to build parallel corpora for Chuukese-English translation training. The NWT is available in both Chuukese (nwt_TE.epub) and English (nwt_E.epub), providing high-quality aligned translations.
File Locations
data/bible/
├── nwt_E.epub # English NWT Bible
└── nwt_TE.epub # Chuukese (Trukese) NWT Bible
EPUB Structure
NWT EPUBs follow a specific structure:
nwt_X.epub/
├── META-INF/
│ └── container.xml
├── OEBPS/
│ ├── content.opf
│ ├── toc.ncx
│ └── OEBPS/
│ ├── 01_Genesis.xhtml
│ ├── 02_Exodus.xhtml
│ └── ... (66 books)
NWT EPUB Parser
Core Parser Class
# src/utils/nwt_epub_parser.py
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import re
from typing import Dict, List, Optional, Tuple
import logging
logger = logging.getLogger(__name__)
class NWTEpubParser:
    """
    Parser for New World Translation Bible EPUBs.

    Extracts books, chapters, and verses with proper structure.

    Results are memoized per instance: the raw HTML of every requested
    book, the parsed tree of the most recently used book, and the verses
    of every chapter that has already been extracted.
    """

    # Book order in NWT (66 books)
    BOOK_ORDER = [
        "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy",
        "Joshua", "Judges", "Ruth", "1 Samuel", "2 Samuel",
        "1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles",
        "Ezra", "Nehemiah", "Esther", "Job", "Psalms", "Proverbs",
        "Ecclesiastes", "Song of Solomon", "Isaiah", "Jeremiah",
        "Lamentations", "Ezekiel", "Daniel", "Hosea", "Joel", "Amos",
        "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah",
        "Haggai", "Zechariah", "Malachi", "Matthew", "Mark", "Luke",
        "John", "Acts", "Romans", "1 Corinthians", "2 Corinthians",
        "Galatians", "Ephesians", "Philippians", "Colossians",
        "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy",
        "Titus", "Philemon", "Hebrews", "James", "1 Peter", "2 Peter",
        "1 John", "2 John", "3 John", "Jude", "Revelation"
    ]

    # Patterns are compiled once instead of on every call.
    _BOOK_FILE_RE = re.compile(r'\d+_(\w+)\.xhtml')
    _CHAPTER_MARKER_RE = re.compile(r'chapter|chapterNum')
    _CHAPTER_HEADING_RE = re.compile(r'chapter')
    _CHAPTER_NUM_RE = re.compile(r'chapterNum')
    _VERSE_CLASS_RE = re.compile(r'verse|v\d+')
    _NUMBER_RE = re.compile(r'\d+')

    def __init__(self, epub_path: str):
        """Open the EPUB at ``epub_path`` and set up empty caches."""
        self.epub_path = epub_path
        self.book = epub.read_epub(epub_path)
        self._book_cache = {}     # book name -> decoded HTML
        self._chapter_cache = {}  # (book name, chapter) -> {verse: text}
        # Only the most recently parsed book is kept as a tree, which keeps
        # memory bounded while still avoiding a re-parse for every chapter.
        self._soup_cache = None   # (book name, BeautifulSoup) or None
        logger.info("Loaded EPUB: %s", epub_path)

    def get_book_list(self) -> List[str]:
        """Get list of available books in the EPUB.

        Book names are taken from document filenames shaped like
        ``<digits>_<name>.xhtml``; documents with other names are skipped.
        """
        books = []
        for item in self.book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            # Extract book name from filename
            match = self._BOOK_FILE_RE.search(item.get_name())
            if match:
                books.append(match.group(1))
        return books

    def get_chapters(self, book_name: str) -> List[int]:
        """Get the sorted list of chapter numbers for a book.

        Returns an empty list when the book cannot be found.
        """
        soup = self._get_book_soup(book_name)
        if soup is None:
            return []
        # Find chapter markers
        chapters = set()
        for elem in soup.find_all(class_=self._CHAPTER_MARKER_RE):
            match = self._NUMBER_RE.search(elem.get_text(strip=True))
            if match:
                chapters.add(int(match.group()))
        return sorted(chapters)

    def get_verses(self, book_name: str, chapter: int) -> Dict[int, str]:
        """
        Get all verses for a specific chapter.

        Returns:
            Dictionary mapping verse numbers to verse text. The dictionary
            is a fresh copy, so callers may modify it freely.
        """
        cache_key = (book_name, chapter)
        cached = self._chapter_cache.get(cache_key)
        if cached is not None:
            return dict(cached)

        soup = self._get_book_soup(book_name)
        if soup is None:
            return {}

        # Find the chapter section
        chapter_section = self._find_chapter_section(soup, chapter)
        if chapter_section is None:
            return {}

        # Extract verses
        verses = {}
        for verse_elem in chapter_section.find_all(class_=self._VERSE_CLASS_RE):
            verse_num = self._extract_verse_number(verse_elem)
            if not verse_num:
                continue
            verse_text = self._extract_verse_text(verse_elem)
            if verse_text:
                verses[verse_num] = verse_text

        self._chapter_cache[cache_key] = verses
        return dict(verses)

    def get_verse(self, book_name: str, chapter: int, verse: int) -> Optional[str]:
        """Get a specific verse, or ``None`` when it is not present."""
        return self.get_verses(book_name, chapter).get(verse)

    def get_verse_range(
        self,
        book_name: str,
        chapter: int,
        start_verse: int,
        end_verse: int
    ) -> str:
        """Get a range of verses (inclusive) as combined text.

        Verse numbers missing from the chapter are skipped; the remaining
        texts are joined with single spaces.
        """
        verses = self.get_verses(book_name, chapter)
        return ' '.join(
            verses[v] for v in range(start_verse, end_verse + 1) if v in verses
        )

    @staticmethod
    def _normalize_book_key(name: str) -> str:
        """Lower-case ``name`` and drop spaces, underscores and punctuation."""
        return re.sub(r'[\W_]+', '', name.lower())

    def _get_book_content(self, book_name: str) -> Optional[str]:
        """Get raw HTML content for a book.

        A document whose filename-derived book name (the same name that
        ``get_book_list`` reports) equals ``book_name`` is preferred, so
        "1 Samuel" resolves to ``09_1_Samuel.xhtml``. When no document
        matches exactly, the first document whose name contains
        ``book_name`` (case-insensitive) is used.
        """
        if book_name in self._book_cache:
            return self._book_cache[book_name]

        wanted = self._normalize_book_key(book_name)
        needle = book_name.lower()
        chosen = None
        for item in self.book.get_items():
            if item.get_type() != ebooklib.ITEM_DOCUMENT:
                continue
            item_name = item.get_name()
            match = self._BOOK_FILE_RE.search(item_name)
            if match and self._normalize_book_key(match.group(1)) == wanted:
                chosen = item
                break
            # Remember only the first loose match as the fallback.
            if chosen is None and needle in item_name.lower():
                chosen = item

        if chosen is None:
            return None
        content = chosen.get_content().decode('utf-8')
        self._book_cache[book_name] = content
        return content

    def _get_book_soup(self, book_name: str):
        """Return the parsed tree for a book, or ``None`` if unavailable."""
        cached = self._soup_cache
        if cached is not None and cached[0] == book_name:
            return cached[1]
        content = self._get_book_content(book_name)
        if not content:
            return None
        soup = BeautifulSoup(content, 'lxml')
        self._soup_cache = (book_name, soup)
        return soup

    def _find_chapter_section(self, soup: "BeautifulSoup", chapter: int):
        """Find the DOM section for a specific chapter.

        Returns the enclosing ``section`` (or, failing that, ``div``) of the
        first matching chapter marker, which may be ``None`` when the marker
        has no such ancestor. Returns ``soup`` itself when no marker matches.
        """
        # Strategy 1: Look for chapter heading.
        # Compare whole numbers, not substrings, so chapter 1 does not
        # match a heading such as "Chapter 10".
        # NOTE(review): a heading whose text holds other numbers (a book
        # name like "2 Samuel", or verse numbers inside a wrapper div) can
        # still match the wrong chapter - confirm against the real markup.
        for heading in soup.find_all(['h2', 'h3', 'div'], class_=self._CHAPTER_HEADING_RE):
            numbers = self._NUMBER_RE.findall(heading.get_text())
            if any(int(number) == chapter for number in numbers):
                return heading.find_parent('section') or heading.find_parent('div')

        # Strategy 2: Look for chapter number span
        for span in soup.find_all('span', class_=self._CHAPTER_NUM_RE):
            if span.get_text(strip=True) == str(chapter):
                return span.find_parent('section') or span.find_parent('div')

        return soup  # Return whole document if chapter not found

    def _extract_verse_number(self, elem) -> Optional[int]:
        """Extract verse number from element.

        Sources are tried in order: a CSS class containing ``v<digits>``,
        a ``data-verse`` attribute, then the text of the first ``sup``
        child. Returns ``None`` when none of them yields an integer.
        """
        # Check for verse number class
        for cls in elem.get('class', []):
            match = re.search(r'v(\d+)', cls)
            if match:
                return int(match.group(1))

        # Check for data attribute
        verse_num = elem.get('data-verse')
        if verse_num:
            try:
                return int(verse_num)
            except ValueError:
                # A non-numeric value (e.g. "1-2") must not abort the whole
                # chapter; fall through to the <sup> check instead.
                pass

        # Check for sup element with verse number
        sup = elem.find('sup')
        if sup:
            try:
                return int(sup.get_text(strip=True))
            except ValueError:
                pass
        return None

    def _extract_verse_text(self, elem) -> str:
        """Extract clean verse text from element."""
        text = elem.get_text(separator=' ', strip=True)
        # Clean up the text
        text = re.sub(r'^\d+\s*', '', text)  # Remove leading verse number
        text = re.sub(r'\*+', '', text)      # Remove footnote markers
        text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
        return text.strip()
Parallel Corpus Builder
class ParallelCorpusBuilder:
    """
    Build parallel corpora from Chuukese and English NWT EPUBs.
    """

    # Accepted values for ``export_for_training(direction=...)``.
    _DIRECTIONS = ('chk_to_en', 'en_to_chk')
    # Accepted values for ``export_jsonl(format_type=...)``.
    _JSONL_FORMATS = ('ollama', 'openai', 'huggingface')

    def __init__(self, chuukese_epub: str, english_epub: str):
        """Open one parser per language."""
        self.chk_parser = NWTEpubParser(chuukese_epub)
        self.en_parser = NWTEpubParser(english_epub)

    def build_parallel_corpus(
        self,
        books: Optional[List[str]] = None,
        min_length: int = 10,
        max_length: int = 500
    ) -> List[Dict]:
        """
        Build parallel verse pairs.

        The Chuukese EPUB drives the iteration: its book list (when
        ``books`` is None), its chapters and its verse numbers are looked
        up in the English EPUB under the same book name.

        Args:
            books: List of books to include (None = all)
            min_length: Minimum verse length in characters
            max_length: Maximum verse length in characters

        Returns:
            List of parallel pairs with metadata
        """
        if books is None:
            books = self.chk_parser.get_book_list()

        pairs = []
        for book in books:
            logger.info("Processing %s...", book)
            for chapter in self.chk_parser.get_chapters(book):
                chk_verses = self.chk_parser.get_verses(book, chapter)
                en_verses = self.en_parser.get_verses(book, chapter)
                # Align verses by number
                for verse_num, chk_text in chk_verses.items():
                    if verse_num not in en_verses:
                        continue
                    en_text = en_verses[verse_num]
                    # Filter by length: both sides must be within bounds
                    if not (min_length <= len(chk_text) <= max_length):
                        continue
                    if not (min_length <= len(en_text) <= max_length):
                        continue
                    pairs.append({
                        'chuukese': chk_text,
                        'english': en_text,
                        'reference': f"{book} {chapter}:{verse_num}",
                        'book': book,
                        'chapter': chapter,
                        'verse': verse_num,
                        'source': 'nwt_bible',
                        'confidence': 0.95  # High quality translations
                    })

        logger.info("Built %d parallel pairs", len(pairs))
        return pairs

    def export_for_training(
        self,
        output_dir: str,
        direction: str = "chk_to_en",
        test_split: float = 0.1,
        val_split: float = 0.1
    ) -> Dict[str, int]:
        """
        Export parallel corpus for model training.

        Writes ``train.tsv``, ``val.tsv`` and ``test.tsv`` into
        ``output_dir``, one ``source<TAB>target`` line per pair.

        Args:
            output_dir: Directory for output files
            direction: 'chk_to_en' or 'en_to_chk'
            test_split: Fraction for test set
            val_split: Fraction for validation set

        Returns:
            Statistics about exported data

        Raises:
            ValueError: If ``direction`` is not one of the two supported
                values, or the split fractions are not each in (0, 1)
                with a sum below 1.
        """
        # Validate before any I/O: a misspelled direction used to fall
        # through silently to the English->Chuukese branch.
        if direction not in self._DIRECTIONS:
            raise ValueError(
                f"direction must be one of {self._DIRECTIONS}, got {direction!r}"
            )
        if not (0 < test_split < 1 and 0 < val_split < 1
                and test_split + val_split < 1):
            raise ValueError(
                "test_split and val_split must each be in (0, 1) and sum to "
                f"less than 1, got test_split={test_split!r}, val_split={val_split!r}"
            )

        import os
        from sklearn.model_selection import train_test_split

        os.makedirs(output_dir, exist_ok=True)
        pairs = self.build_parallel_corpus()

        # Split data. val_split is a fraction of the whole corpus, so it is
        # rescaled to a fraction of what remains after removing the test set.
        train, test = train_test_split(pairs, test_size=test_split, random_state=42)
        train, val = train_test_split(
            train, test_size=val_split / (1 - test_split), random_state=42
        )

        if direction == "chk_to_en":
            source_key, target_key = 'chuukese', 'english'
        else:
            source_key, target_key = 'english', 'chuukese'

        # Export TSV files
        for split_name, split_data in [('train', train), ('val', val), ('test', test)]:
            filepath = os.path.join(output_dir, f'{split_name}.tsv')
            with open(filepath, 'w', encoding='utf-8') as f:
                for pair in split_data:
                    f.write(f"{pair[source_key]}\t{pair[target_key]}\n")

        return {
            'total': len(pairs),
            'train': len(train),
            'val': len(val),
            'test': len(test)
        }

    @staticmethod
    def _format_example(pair: Dict, format_type: str) -> Dict:
        """Shape one verse pair as a training example for ``format_type``."""
        if format_type == "ollama":
            return {
                "prompt": f"Translate this Chuukese verse to English: {pair['chuukese']}",
                "response": pair['english'],
                "system": "You are a Chuukese-English Bible translator."
            }
        if format_type == "openai":
            return {
                "messages": [
                    {"role": "system", "content": "You are a Chuukese-English translator."},
                    {"role": "user", "content": f"Translate: {pair['chuukese']}"},
                    {"role": "assistant", "content": pair['english']}
                ]
            }
        # huggingface
        return {
            "text": f"### Instruction:\nTranslate this Chuukese text to English.\n\n### Input:\n{pair['chuukese']}\n\n### Response:\n{pair['english']}"
        }

    def export_jsonl(self, output_path: str, format_type: str = "ollama") -> int:
        """
        Export parallel corpus as JSONL for LLM training.

        Args:
            output_path: Path for output file
            format_type: 'ollama', 'openai', or 'huggingface'

        Returns:
            Number of examples exported

        Raises:
            ValueError: If ``format_type`` is not a supported format.
        """
        # An unknown format used to be exported silently as 'huggingface'.
        if format_type not in self._JSONL_FORMATS:
            raise ValueError(
                f"format_type must be one of {self._JSONL_FORMATS}, got {format_type!r}"
            )

        import json

        pairs = self.build_parallel_corpus()
        with open(output_path, 'w', encoding='utf-8') as f:
            for pair in pairs:
                example = self._format_example(pair, format_type)
                f.write(json.dumps(example, ensure_ascii=False) + '\n')
        return len(pairs)
Usage Examples
Basic Verse Extraction
from src.utils.nwt_epub_parser import NWTEpubParser
# Open the English NWT EPUB.
parser = NWTEpubParser(epub_path='data/bible/nwt_E.epub')

# How many books did the parser find?
books = parser.get_book_list()
print(f"Found {len(books)} books")

# Chapter numbers available in Genesis.
chapters = parser.get_chapters(book_name='Genesis')
print(f"Genesis has {len(chapters)} chapters")

# Every verse of Genesis chapter 1, keyed by verse number.
verses = parser.get_verses(book_name='Genesis', chapter=1)
print(f"Genesis 1 has {len(verses)} verses")

# A single verse: Genesis 1:1.
verse = parser.get_verse(book_name='Genesis', chapter=1, verse=1)
print(f"Genesis 1:1 - {verse}")
Building Parallel Corpus
from src.utils.nwt_epub_parser import ParallelCorpusBuilder
# Pair the Chuukese EPUB (first argument) with the English EPUB (second).
builder = ParallelCorpusBuilder('data/bible/nwt_TE.epub', 'data/bible/nwt_E.epub')

# Collect every aligned verse pair.
pairs = builder.build_parallel_corpus()
print(f"Total parallel pairs: {len(pairs)}")

# Write train/val/test TSV files for Helsinki-NLP training
# (output directory first, translation direction second).
stats = builder.export_for_training('training_data/bible', 'chk_to_en')
print(f"Train: {stats['train']}, Val: {stats['val']}, Test: {stats['test']}")
Exporting for LLM Training
# Write the corpus as Ollama-style JSONL (output path first, format second).
# `builder` is the ParallelCorpusBuilder created in the previous example.
count = builder.export_jsonl('training_data/bible_ollama.jsonl', 'ollama')
print(f"Exported {count} examples for Ollama")
Quality Considerations
High-Quality Pairs
- Bible translations are professionally reviewed
- Consistent terminology across verses
- Good for formal/religious register
Potential Issues
- Religious vocabulary may not generalize
- Formal language style
- Some verses have complex structures
Filtering Strategies
def filter_pairs(
    pairs,
    *,
    min_length=20,
    min_ratio=0.5,
    max_ratio=2.0,
    max_digit_fraction=0.3,
):
    """Filter pairs for training quality.

    Args:
        pairs: Iterable of dicts with 'chuukese' and 'english' text.
        min_length: Minimum length, in characters, required of both texts.
        min_ratio: Lowest accepted Chuukese/English length ratio.
        max_ratio: Highest accepted Chuukese/English length ratio.
        max_digit_fraction: Highest accepted share of digit characters in
            the Chuukese text.

    Returns:
        A new list with the pairs that pass every check, in input order.
    """
    filtered = []
    for pair in pairs:
        chuukese = pair['chuukese']
        english = pair['english']

        # Empty text can only get this far when min_length <= 0; skip it so
        # the ratio computations below never divide by zero.
        if not chuukese or not english:
            continue

        # Skip very short pairs
        if len(chuukese) < min_length or len(english) < min_length:
            continue

        # Skip pairs with unusual length ratios
        ratio = len(chuukese) / len(english)
        if ratio < min_ratio or ratio > max_ratio:
            continue

        # Skip pairs with mostly numbers
        digit_count = sum(c.isdigit() for c in chuukese)
        if digit_count / len(chuukese) > max_digit_fraction:
            continue

        filtered.append(pair)
    return filtered
Dependencies
- ebooklib>=0.18: EPUB parsing
- beautifulsoup4>=4.12.0: HTML parsing
- lxml>=4.9.0: XML/HTML parser
- scikit-learn>=1.0.0: Data splitting (optional)
Weekly Installs
7
Repository
findinfinitelabs/chuuk
First Seen
14 days ago
Security Audits
Installed on
- opencode: 7
- gemini-cli: 7
- github-copilot: 7
- codex: 7
- amp: 7
- cline: 7