Tutorials and Advanced Usage
This page provides comprehensive tutorials for advanced pybiber workflows, from corpus preparation to statistical analysis and visualization.
Tutorial 1: Building and Processing Large Corpora
Working with Directory Structures
When working with large corpora, organizing your texts in a systematic directory structure is crucial:
corpus/
├── academic/
│ ├── biology/
│ │ ├── paper001.txt
│ │ └── paper002.txt
│ └── literature/
│ ├── essay001.txt
│ └── essay002.txt
└── news/
├── politics/
└── sports/
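If register information is encoded in the folder hierarchy like this, it can be recovered from the file paths while the texts are read. A minimal sketch follows; the register and subregister column names are illustrative, not part of pybiber, and it assumes the two-level layout shown above:
from pathlib import Path

import polars as pl

corpus_root = Path("corpus/")
records = []
for path in sorted(corpus_root.rglob("*.txt")):
    relative = path.relative_to(corpus_root)  # e.g. academic/biology/paper001.txt
    records.append({
        "doc_id": path.stem,
        "register": relative.parts[0],        # e.g. "academic"
        "subregister": relative.parts[1],     # e.g. "biology"
        "text": path.read_text(encoding="utf-8"),
    })

corpus_meta = pl.DataFrame(records)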
Recursive Text Processing
Use the pipeline to process nested directories:
import pybiber as pb

# Initialize pipeline with optimized settings
pipeline = pb.PybiberPipeline(
    model="en_core_web_sm",
    disable_ner=True,   # Faster processing
    n_process=4,        # Use multiple cores
    batch_size=100      # Optimize batch size
)

# Process entire directory structure
features = pipeline.run_from_folder(
    "corpus/",
    recursive=True,
    normalize=True
)
Handling Different Text Formats
While pybiber primarily works with .txt files, you can preprocess other formats:
import polars as pl
from pathlib import Path

# Process CSV with text columns
csv_data = pl.read_csv("articles.csv")
corpus = csv_data.select([
    pl.col("article_id").alias("doc_id"),
    pl.col("content").alias("text")
])

# Process with pipeline
pipeline = pb.PybiberPipeline()
features = pipeline.run(corpus)
Tutorial 2: Corpus Comparison and Classification
Comparing Multiple Corpora
import pybiber as pb
import polars as pl

# Load multiple corpora
academic_corpus = pb.corpus_from_folder("academic_texts/")
news_corpus = pb.corpus_from_folder("news_texts/")

# Add corpus labels
academic_corpus = academic_corpus.with_columns(
    pl.lit("academic").alias("corpus_type")
)
news_corpus = news_corpus.with_columns(
    pl.lit("news").alias("corpus_type")
)

# Combine and process
combined_corpus = pl.concat([academic_corpus, news_corpus])
pipeline = pb.PybiberPipeline()
features = pipeline.run(combined_corpus)
# Attach corpus labels for analysis
features = features.join(
    combined_corpus.select(["doc_id", "corpus_type"]),
    on="doc_id"
)
Feature-Based Classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Prepare data for classification
X = features.select(pl.selectors.numeric()).to_numpy()
y = features.get_column("corpus_type").to_numpy()

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
Tutorial 3: Advanced Multi-Dimensional Analysis
Custom MDA Workflows
import pybiber as pb
import polars as pl
from pybiber.data import micusp_mini

# Process sample data
pipeline = pb.PybiberPipeline()
features = pipeline.run(micusp_mini)

# Extract discipline information
features = features.with_columns(
    pl.col("doc_id").str.extract(r"^([A-Z]+)", 0).alias("discipline")
)

# Initialize analyzer
analyzer = pb.BiberAnalyzer(features, id_column=True)
[INFO] Using MATTR for f_43_type_token
[INFO] All features normalized per 1000 tokens except: f_43_type_token and f_44_mean_word_length
Performance: Corpus processing completed in 74.85s
Customizing Factor Analysis Parameters
# Experiment with different correlation thresholds
analyzer.mda(n_factors=4, cor_min=0.3, threshold=0.4)

# Examine the effect on feature selection
print("Features after correlation filtering:")
print(analyzer.mda_summary.shape)
INFO:pybiber.biber_analyzer:Dropping 11 variable(s) with max |r| <= 0.30: ['f_04_place_adverbials', 'f_05_time_adverbials', 'f_15_gerunds', 'f_18_by_passives', 'f_25_present_participle', 'f_34_sentence_relatives', 'f_35_because', 'f_46_downtoners', 'f_50_discourse_particles', 'f_53_modal_necessity', 'f_64_phrasal_coordination']
Features after correlation filtering:
(4, 6)
Comparing Factor Solutions
import matplotlib.pyplot as plt

# Compare different numbers of factors
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for i, n_factors in enumerate([2, 3, 4, 5]):
    analyzer.mda(n_factors=n_factors)
    ax = axes[i // 2, i % 2]

    # Plot scree plot for each solution
    plt.sca(ax)
    analyzer.mdaviz_screeplot()
    ax.set_title(f"{n_factors} Factor Solution")

plt.tight_layout()
plt.show()
Tutorial 4: Temporal and Diachronic Analysis
Analyzing Language Change Over Time
import polars as pl

# Prepare time-stamped corpus
corpus = pl.read_csv("historical_texts.csv")
corpus = corpus.with_columns([
    pl.col("year").cast(pl.Int32),
    pl.col("decade").cast(pl.String)
])

# Process with pybiber
pipeline = pb.PybiberPipeline()
features = pipeline.run(corpus)

# Add temporal metadata
features = features.join(
    corpus.select(["doc_id", "year", "decade"]),
    on="doc_id"
)

# Analyze by decade
decade_analysis = pb.BiberAnalyzer(
    features.drop("year"),
    id_column=True
)

# Examine temporal dimensions
decade_analysis.mda(n_factors=3)
decade_analysis.mdaviz_groupmeans(factor=1)
Trend Analysis
# Calculate feature trends over time
trends = (
    features
    .group_by("decade")
    .agg([
        pl.selectors.numeric().mean().name.suffix("_mean"),
        pl.selectors.numeric().std().name.suffix("_std")
    ])
    .sort("decade")
)

# Visualize specific feature trends
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.plot(trends["decade"], trends["f_01_past_tense_mean"],
         marker='o', label='Past Tense')
plt.plot(trends["decade"], trends["f_03_present_tense_mean"],
         marker='s', label='Present Tense')
plt.xlabel("Decade")
plt.ylabel("Normalized Frequency")
plt.legend()
plt.title("Tense Usage Over Time")
plt.xticks(rotation=45)
plt.show()
Tutorial 5: Cross-Linguistic and Multilingual Analysis
Comparing Languages
# Process different language corpora
english_pipeline = pb.PybiberPipeline(model="en_core_web_sm")
spanish_pipeline = pb.PybiberPipeline(model="es_core_news_sm")

english_features = english_pipeline.run_from_folder("english_texts/")
spanish_features = spanish_pipeline.run_from_folder("spanish_texts/")

# Add language labels
english_features = english_features.with_columns(
    pl.lit("English").alias("language")
)
spanish_features = spanish_features.with_columns(
    pl.lit("Spanish").alias("language")
)

# Combine for comparative analysis
multilingual_features = pl.concat([english_features, spanish_features])
Language-Specific Adaptations
# Customize feature extraction for specific languages
def extract_language_specific_features(tokens, language="en"):
    base_features = pb.biber(tokens, normalize=True)

    if language == "es":
        # Add Spanish-specific features
        # (extract_spanish_subjunctive is a user-defined helper; one possible
        # sketch follows below)
        spanish_features = extract_spanish_subjunctive(tokens)
        base_features = base_features.join(spanish_features, on="doc_id")

    return base_features
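extract_spanish_subjunctive is a user-supplied helper, not part of the pybiber API. One possible sketch is shown below; it is an assumption, and it works from a raw corpus DataFrame (doc_id and text columns) rather than the parsed token table, because morphological features such as mood are easiest to read directly from spaCy's token.morph.
import polars as pl
import spacy

def extract_spanish_subjunctive(corpus, nlp=None):
    """Hypothetical helper: subjunctive verb forms per 1,000 tokens.

    Assumes `corpus` is a polars DataFrame with doc_id and text columns.
    """
    nlp = nlp or spacy.load("es_core_news_sm")
    rows = []
    for doc_id, text in corpus.select(["doc_id", "text"]).iter_rows():
        doc = nlp(text)
        n_tokens = sum(1 for t in doc if not t.is_space)
        n_subj = sum(1 for t in doc if "Sub" in t.morph.get("Mood"))
        rate = 1000 * n_subj / n_tokens if n_tokens else 0.0
        rows.append({"doc_id": doc_id, "f_es_subjunctive": rate})
    return pl.DataFrame(rows)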
Tutorial 6: Statistical Validation and Robustness
Cross-Validation of Factor Solutions
from sklearn.model_selection import KFold
import numpy as np

def validate_factor_solution(features, n_factors=3, n_splits=5):
    """Cross-validate factor stability."""
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    loadings_stability = []

    for train_idx, _ in kf.split(features):
        # Sample training data
        train_features = features[train_idx]

        # Fit MDA
        analyzer = pb.BiberAnalyzer(train_features)
        analyzer.mda(n_factors=n_factors)

        # Store loadings
        loadings_stability.append(analyzer.mda_loadings)

    # Calculate loading stability metrics
    return analyze_loading_stability(loadings_stability)
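analyze_loading_stability stands in for whatever stability metric you prefer; it is not provided by pybiber. A hedged sketch is given below, assuming every fold's mda_loadings is a polars DataFrame with the same features in the same row order and the factor loadings in its numeric columns; note that factors can also swap order between folds, so in practice you may want to match factors before comparing them.
import itertools

import numpy as np
import polars as pl

def analyze_loading_stability(loadings_list):
    """Hypothetical helper: mean absolute correlation of corresponding
    factor loadings across folds (higher = more stable).
    """
    mats = [df.select(pl.selectors.numeric()).to_numpy() for df in loadings_list]
    correlations = []
    for a, b in itertools.combinations(mats, 2):
        for k in range(min(a.shape[1], b.shape[1])):
            r = np.corrcoef(a[:, k], b[:, k])[0, 1]
            correlations.append(abs(r))  # absolute value: factor signs can flip
    return float(np.mean(correlations))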
Bootstrap Confidence Intervals
def bootstrap_mda_confidence(features, n_bootstrap=1000):
    """Calculate bootstrap confidence intervals for factor loadings."""
    bootstrap_loadings = []
    n_docs = features.shape[0]

    for i in range(n_bootstrap):
        # Resample with replacement
        sample_idx = np.random.choice(n_docs, n_docs, replace=True)
        boot_features = features[sample_idx]

        # Fit MDA
        analyzer = pb.BiberAnalyzer(boot_features)
        analyzer.mda(n_factors=3)

        bootstrap_loadings.append(analyzer.mda_loadings)

    # Calculate confidence intervals
    return calculate_loading_confidence(bootstrap_loadings)
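Like the stability helper above, calculate_loading_confidence is user-supplied. A minimal percentile-interval sketch follows, under the assumption that every bootstrap replicate retains the same features in the same order, and with the same caveat that factor order and sign can flip between replicates.
import numpy as np
import polars as pl

def calculate_loading_confidence(bootstrap_loadings, alpha=0.05):
    """Hypothetical helper: percentile confidence intervals for loadings."""
    # Stack replicates into a (n_bootstrap, n_features, n_factors) array
    stacked = np.stack([
        df.select(pl.selectors.numeric()).to_numpy()
        for df in bootstrap_loadings
    ])
    lower = np.percentile(stacked, 100 * alpha / 2, axis=0)
    upper = np.percentile(stacked, 100 * (1 - alpha / 2), axis=0)
    return lower, upper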
Tutorial 7: Performance Optimization
Memory-Efficient Processing
# Process large corpora in chunks
def process_large_corpus(corpus_path, chunk_size=1000):
    """Process large corpus in memory-efficient chunks."""
    # Get all text files
    text_files = list(Path(corpus_path).rglob("*.txt"))
    all_features = []

    # Create the pipeline once so the spaCy model is loaded only once
    pipeline = pb.PybiberPipeline()

    # Process in chunks
    for i in range(0, len(text_files), chunk_size):
        chunk_files = text_files[i:i + chunk_size]

        # Create temporary corpus
        chunk_corpus = pb.readtext(chunk_files)

        # Process chunk
        chunk_features = pipeline.run(chunk_corpus)
        all_features.append(chunk_features)

        # Clear memory
        del chunk_corpus, chunk_features

    # Combine all features
    return pl.concat(all_features)
Parallel Processing Optimization
# Optimize parallel processing parameters
def find_optimal_batch_size(corpus, model="en_core_web_sm"):
    """Find optimal batch size for your system."""
    import time

    batch_sizes = [10, 50, 100, 200, 500]
    processing_times = []

    for batch_size in batch_sizes:
        pipeline = pb.PybiberPipeline(
            model=model,
            batch_size=batch_size,
            n_process=4
        )

        start_time = time.time()
        _ = pipeline.run(corpus.head(1000))  # Test subset
        end_time = time.time()

        processing_times.append(end_time - start_time)

    # Find optimal batch size
    optimal_idx = np.argmin(processing_times)
    return batch_sizes[optimal_idx]
Best Practices Summary
Data Preparation
- Organize texts in systematic directory structures
- Encode metadata in filenames or separate files
- Clean text appropriately for your spaCy model
- Validate corpus structure before processing (see the sketch after this list)
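A quick pre-flight check along these lines might look like the sketch below; it assumes the corpus is a polars DataFrame with doc_id and text columns, and validate_corpus is not a pybiber function.
import polars as pl

def validate_corpus(corpus):
    """Hypothetical helper: basic sanity checks before feature extraction."""
    # Duplicate document IDs would silently collide in later joins
    n_dupes = corpus.get_column("doc_id").is_duplicated().sum()
    if n_dupes:
        raise ValueError(f"{n_dupes} duplicate doc_id value(s) found")

    # Empty or whitespace-only texts produce no usable features
    empty = corpus.filter(pl.col("text").str.strip_chars().str.len_chars() == 0)
    if empty.height:
        print("Warning: empty documents:", empty.get_column("doc_id").to_list())

    return corpus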
Feature Extraction
- Choose appropriate spaCy models for your texts
- Consider disabling unnecessary components (like NER) for speed
- Use appropriate normalization (per 1000 tokens vs. absolute counts)
- Monitor memory usage with large corpora
Statistical Analysis
- Examine scree plots before selecting number of factors
- Validate factor solutions with multiple approaches
- Consider cross-validation for robust results
- Document all analytical decisions and parameters
Performance Optimization
- Experiment with batch sizes and parallel processing
- Process large corpora in chunks if memory is limited
- Monitor system resources during processing
- Cache intermediate results when possible (see the sketch after this list)
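For the last point, one simple caching pattern is to write the feature table to Parquet after the first run and reload it in later sessions. A sketch, with an illustrative cache path:
from pathlib import Path

import polars as pl
import pybiber as pb

cache_path = Path("features_cache.parquet")

if cache_path.exists():
    # Reuse previously extracted features
    features = pl.read_parquet(cache_path)
else:
    # Extract once, then cache for later sessions
    features = pb.PybiberPipeline().run_from_folder("corpus/", recursive=True)
    features.write_parquet(cache_path)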
These tutorials provide a foundation for advanced pybiber usage. Adapt these patterns to your specific research questions and computational constraints.