Custom Seeds Guide¶

Learn how to use custom seeds in SteadyText for reproducible variations in text generation and embeddings.

Overview¶

SteadyText uses seeds to control randomness, allowing you to:

- Generate different outputs for the same prompt
- Ensure reproducible results across runs
- Create variations while maintaining determinism
- Control randomness in production systems

Table of Contents¶

Understanding Seeds
What is a Seed?
Seed Behavior
Basic Seed Usage
Simple Text Generation
Embedding Generation
Reproducible Research
Research Workflow Example
A/B Testing with Seeds
Content Comparison Framework
Email Campaign Testing
Content Variations
Style and Tone Variations
Multi-Language Content
Embedding Experiments
Semantic Similarity Analysis
Domain-Specific Embedding Clusters
CLI Workflows
Batch Processing Scripts
Reproducible Research Pipeline
Advanced Patterns
Seed Scheduling and Management
Conditional Seed Strategies
Best Practices

Understanding Seeds¶

What is a Seed?¶

A seed is an integer that initializes the random number generator. Same seed + same input = same output, always.

import steadytext

# Default seed (42) - always same result
# Two calls with the same prompt and no explicit seed are compared.
text1 = steadytext.generate("Hello world")
text2 = steadytext.generate("Hello world")
assert text1 == text2  # Always true

# Custom seeds - different results
# The same prompt is generated under two different explicit seeds.
text3 = steadytext.generate("Hello world", seed=123)
text4 = steadytext.generate("Hello world", seed=456)
assert text3 != text4  # Different seeds, different outputs

Seed Behavior¶

- Deterministic: The same seed always produces the same result
- Independent: Each operation uses its own seed
- Cascading: The seed affects all random choices in a generation
- Cross-platform: The same seed works identically everywhere

Basic Seed Usage¶

Simple Text Generation¶

import steadytext

# Default seed (42) - consistent across runs
text1 = steadytext.generate("Write a haiku about AI")
text2 = steadytext.generate("Write a haiku about AI")
assert text1 == text2  # Always identical

# Custom seed - reproducible but different from default
text3 = steadytext.generate("Write a haiku about AI", seed=123)
text4 = steadytext.generate("Write a haiku about AI", seed=123)
assert text3 == text4  # Same seed, same result
assert text1 != text3  # Different seeds, different results

# Print one result per seed so the two outputs can be compared by eye.
print("Default seed result:", text1)
print("Custom seed result:", text3)

Embedding Generation¶

import numpy as np

# NOTE: this snippet uses `steadytext` without importing it; it assumes
# `import steadytext` has already been run, as in the previous example.

# Default seed embeddings
emb1 = steadytext.embed("artificial intelligence")
emb2 = steadytext.embed("artificial intelligence")
assert np.array_equal(emb1, emb2)  # Identical

# Custom seed embeddings
emb3 = steadytext.embed("artificial intelligence", seed=456)
emb4 = steadytext.embed("artificial intelligence", seed=456)
assert np.array_equal(emb3, emb4)  # Same seed, same result
assert not np.array_equal(emb1, emb3)  # Different seeds, different embeddings

# Calculate similarity between different seed embeddings
# NOTE(review): a dot product equals cosine similarity only for unit-length
# vectors; this relies on steadytext returning normalized embeddings.
similarity = np.dot(emb1, emb3)  # Cosine similarity (vectors are normalized)
print(f"Similarity between different seeds: {similarity:.3f}")

Reproducible Research¶

Research Workflow Example¶

import steadytext
import json
from datetime import datetime

class ReproducibleResearch:
    """Run generations and embeddings under logged, auto-incrementing seeds.

    Every logged call uses ``current_seed`` and then increments it. Each
    result is stored together with the seed, inputs and keyword arguments
    that produced it, so a run can be saved as JSON and re-verified later.
    """

    def __init__(self, base_seed=42):
        """Start a session whose first operation uses ``base_seed``."""
        self.base_seed = base_seed
        self.current_seed = base_seed
        self.results = []
        self.metadata = {
            "start_time": datetime.now().isoformat(),
            "base_seed": base_seed,
            "steadytext_version": "2.1.0+",
        }

    def generate_with_logging(self, prompt, **kwargs):
        """Generate text and log the result with seed information.

        Extra keyword arguments are forwarded to ``steadytext.generate`` and
        stored in the log entry, so they must be JSON-serializable for
        ``save_results`` to succeed.
        """
        result = steadytext.generate(prompt, seed=self.current_seed, **kwargs)

        self.results.append({
            "seed": self.current_seed,
            "prompt": prompt,
            "result": result,
            "kwargs": kwargs,
            "timestamp": datetime.now().isoformat()
        })

        self.current_seed += 1  # Increment for next generation
        return result

    def embed_with_logging(self, text, **kwargs):
        """Generate embedding and log the result with seed information.

        The embedding is logged via ``.tolist()`` so the entry can be written
        as JSON; the value returned to the caller is left untouched.
        """
        embedding = steadytext.embed(text, seed=self.current_seed, **kwargs)

        self.results.append({
            "seed": self.current_seed,
            "text": text,
            "embedding": embedding.tolist(),  # Convert numpy array to list
            "kwargs": kwargs,
            "timestamp": datetime.now().isoformat()
        })

        self.current_seed += 1
        return embedding

    def save_results(self, filename):
        """Save metadata and all logged results to a JSON file."""
        with open(filename, 'w', encoding="utf-8") as f:
            json.dump({
                "metadata": self.metadata,
                "results": self.results
            }, f, indent=2)

    @staticmethod
    def _embeddings_match(regenerated, stored, atol=1e-6):
        """Return True when two embeddings agree element-wise within ``atol``."""
        # numpy is imported here because this example's import block only
        # brings in steadytext, json and datetime; relying on a global ``np``
        # raised NameError during verification.
        import numpy as np

        return bool(np.allclose(regenerated, stored, atol=atol))

    def load_and_verify(self, filename):
        """Load previous results and verify reproducibility.

        Each stored entry is regenerated with its recorded seed and keyword
        arguments, and a pass/fail line is printed per entry. Entries holding
        a ``"prompt"`` key are treated as text generations; entries holding a
        ``"text"`` key are treated as embeddings.
        """
        with open(filename, 'r', encoding="utf-8") as f:
            data = json.load(f)

        print("Verifying reproducibility...")
        for result in data["results"]:
            if "prompt" in result:  # Text generation
                regenerated = steadytext.generate(
                    result["prompt"],
                    seed=result["seed"],
                    **result["kwargs"]
                )
                if regenerated == result["result"]:
                    print(f"✓ Seed {result['seed']}: Text generation verified")
                else:
                    print(f"✗ Seed {result['seed']}: Text generation FAILED")

            elif "text" in result:  # Embedding
                regenerated = steadytext.embed(
                    result["text"],
                    seed=result["seed"],
                    **result["kwargs"]
                )
                # Tolerant comparison: the stored value is a plain list that
                # has been round-tripped through JSON.
                if self._embeddings_match(regenerated, result["embedding"]):
                    print(f"✓ Seed {result['seed']}: Embedding verified")
                else:
                    print(f"✗ Seed {result['seed']}: Embedding FAILED")

# Usage example
# Seeds are handed out sequentially starting at 100: the three generations
# below consume the first three seeds and the three embeddings the next three.
research = ReproducibleResearch(base_seed=100)

# Conduct research with automatic seed management
research_prompts = [
    "Explain the benefits of renewable energy",
    "Describe the future of artificial intelligence",
    "Summarize the importance of biodiversity"
]

for prompt in research_prompts:
    result = research.generate_with_logging(prompt, max_new_tokens=200)
    print(f"Generated {len(result)} characters for: {prompt[:50]}...")

# Generate embeddings for analysis
embedding_texts = ["AI", "machine learning", "deep learning"]
for text in embedding_texts:
    embedding = research.embed_with_logging(text)
    print(f"Generated embedding for: {text}")

# Save results for reproducibility
research.save_results("research_results.json")
print("Results saved to research_results.json")

# Later: verify reproducibility
# Re-runs every logged operation with its recorded seed and prints a
# pass/fail line per entry.
research.load_and_verify("research_results.json")

A/B Testing with Seeds¶

A/B testing is a powerful technique for comparing different variations of content. With SteadyText's deterministic seeds, you can create reproducible variations for testing.

Content Comparison Framework¶

Create a framework for systematic A/B testing of generated content.

import steadytext
import json
from datetime import datetime

class ABTestFramework:
    """Produce and compare seed-driven variations of a single prompt.

    Variation ``k`` is generated with seed ``base_seed + k``, so the stored
    configuration is enough to regenerate the whole set.
    """

    def __init__(self, base_prompt, variations=5, base_seed=42):
        self.base_prompt = base_prompt
        self.variations = variations
        self.base_seed = base_seed
        self.results = []

    def generate_variations(self):
        """Generate one piece of content per variation and record it.

        Records are appended to ``self.results`` (calling this again keeps
        appending) and the accumulated list is returned.
        """
        for offset in range(self.variations):
            variant_seed = self.base_seed + offset
            text = steadytext.generate(self.base_prompt, seed=variant_seed)

            # Labels run A, B, C, ... via consecutive code points from 65.
            label = f"variant_{chr(65+offset)}"
            record = {
                "variation_id": label,
                "seed": variant_seed,
                "content": text,
                "metrics": {
                    "length": len(text),
                    "word_count": len(text.split()),
                    "timestamp": datetime.now().isoformat(),
                },
            }
            self.results.append(record)

        return self.results

    def compare_variations(self):
        """Print a short summary (seed, size, preview) of every variation."""
        total = len(self.results)
        print(f"Generated {total} variations for: {self.base_prompt[:50]}...")
        print("-" * 80)

        for entry in self.results:
            stats = entry["metrics"]
            print(f"\n{entry['variation_id']} (seed: {entry['seed']}):")
            print(f"Length: {stats['length']} chars")
            print(f"Words: {stats['word_count']}")
            print(f"Preview: {entry['content'][:100]}...")

    def save_test_results(self, filename):
        """Write the test configuration and all recorded results as JSON."""
        payload = {
            "test_config": {
                "base_prompt": self.base_prompt,
                "variations": self.variations,
                "base_seed": self.base_seed,
            },
            "results": self.results,
        }
        with open(filename, 'w') as handle:
            json.dump(payload, handle, indent=2)

# Example usage
# base_seed is left at its default (42), so the three variants use
# seeds 42, 43 and 44.
ab_test = ABTestFramework(
    base_prompt="Write a compelling email subject line for our new product launch",
    variations=3
)

# Generate, print a summary, then persist the results to JSON.
ab_test.generate_variations()
ab_test.compare_variations()
ab_test.save_test_results("ab_test_results.json")

Email Campaign Testing¶

Test different email variations with consistent seeding for reproducibility.

import steadytext

class EmailCampaignTester:
    """Generate reproducible email variants for one campaign and audience.

    Each tone gets a fixed seed, and each customer gets a seed derived from
    the tone's seed and a stable hash of the customer's name.
    """

    def __init__(self, campaign_name, target_audience):
        self.campaign_name = campaign_name
        self.target_audience = target_audience
        # tone -> {"seed", "content", "tone"}, filled by create_campaign_variants
        self.templates = {}

    def generate_email_variant(self, tone, seed):
        """Generate email content with specific tone and seed."""
        prompt = f"""Write a marketing email for {self.campaign_name} targeting {self.target_audience}.
        Tone: {tone}
        Include: subject line, greeting, body, and call-to-action."""

        return steadytext.generate(prompt, seed=seed, max_new_tokens=400)

    def create_campaign_variants(self):
        """Create one email variant per tone, stored in ``self.templates``.

        Returns:
            The ``self.templates`` mapping of tone to its seed and content.
        """
        tones = ["professional", "friendly", "urgent", "casual", "exclusive"]

        for i, tone in enumerate(tones):
            seed = 1000 + i  # Consistent seed for each tone
            self.templates[tone] = {
                "seed": seed,
                "content": self.generate_email_variant(tone, seed),
                "tone": tone
            }

        return self.templates

    @staticmethod
    def _customer_seed(base_seed, name):
        """Return ``base_seed`` plus a stable 0-999 offset derived from ``name``."""
        # The built-in hash() of a str is salted per interpreter process
        # (PYTHONHASHSEED), so it would yield a different seed on every run.
        # An md5 digest is identical across runs and platforms.
        import hashlib

        digest = hashlib.md5(name.encode("utf-8")).hexdigest()
        return base_seed + int(digest, 16) % 1000

    def test_personalization(self, template_tone, customer_names):
        """Generate a personalized preview per customer with stable seeds.

        Args:
            template_tone: Key into ``self.templates``; the template must
                already exist (see ``create_campaign_variants``).
            customer_names: Iterable of customer name strings.

        Returns:
            A list of dicts with ``customer``, ``seed`` and ``preview`` keys.

        Raises:
            KeyError: If no template exists for ``template_tone``.
        """
        base_template = self.templates[template_tone]
        personalized = []

        for name in customer_names:
            # Use customer-specific seed for personalization
            customer_seed = self._customer_seed(base_template["seed"], name)

            prompt = f"Personalize this email for {name}: {base_template['content'][:200]}..."
            personalized_content = steadytext.generate(prompt, seed=customer_seed, max_new_tokens=100)

            personalized.append({
                "customer": name,
                "seed": customer_seed,
                "preview": personalized_content[:100] + "..."
            })

        return personalized

# Example usage
tester = EmailCampaignTester("Summer Sale 2024", "young professionals")
# Templates must be created first: test_personalization looks the tone up
# in tester.templates.
variants = tester.create_campaign_variants()

# Test personalization
customers = ["Alice Johnson", "Bob Smith", "Carol Davis"]
personalized = tester.test_personalization("friendly", customers)

# Print each customer's seed alongside the preview text.
for p in personalized:
    print(f"Email for {p['customer']} (seed: {p['seed']}):")
    print(p['preview'])
    print()

Content Variations¶

Generate content in different styles, tones, and languages using seed-based variations.

Style and Tone Variations¶

Use different seeds to generate content with various stylistic approaches.

import steadytext

class StyleVariationGenerator:
    """Rewrite one piece of content in several styles, one fixed seed each."""

    def __init__(self, base_content):
        self.base_content = base_content
        # Style name -> seed used whenever that style is generated.
        self.styles = {
            "formal": 2000,
            "casual": 2001,
            "technical": 2002,
            "creative": 2003,
            "minimalist": 2004,
        }

    def generate_style_variant(self, style):
        """Return ``base_content`` rewritten in ``style``.

        Raises:
            ValueError: If ``style`` is not a key of ``self.styles``.
        """
        if style not in self.styles:
            raise ValueError(f"Unknown style: {style}")

        rewrite_prompt = f"Rewrite this in a {style} style: {self.base_content}"
        return steadytext.generate(
            rewrite_prompt,
            seed=self.styles[style],
            max_new_tokens=300,
        )

    def generate_all_styles(self):
        """Return ``{style: {"seed": ..., "content": ...}}`` for every style."""
        return {
            style_name: {
                "seed": style_seed,
                "content": self.generate_style_variant(style_name),
            }
            for style_name, style_seed in self.styles.items()
        }

    def compare_lengths(self, results):
        """Print the word count and seed of each entry in ``results``."""
        for style_name, entry in results.items():
            words = entry["content"].split()
            print(f"{style_name.capitalize()}: {len(words)} words (seed: {entry['seed']})")

# Example usage
base_text = "Our company provides innovative solutions for modern businesses."
generator = StyleVariationGenerator(base_text)

# One generation per configured style, then a word-count comparison.
all_styles = generator.generate_all_styles()
generator.compare_lengths(all_styles)

# Show samples
# Only the first 150 characters of each variant are printed.
for style, data in all_styles.items():
    print(f"\n{style.upper()} (seed: {data['seed']}):")
    print(data["content"][:150] + "...")

Multi-Language Content¶

Adapt content for different languages and cultural contexts using seeds.

import steadytext

class MultilingualContentGenerator:
    """Adapt one source text to several languages, one fixed seed each."""

    def __init__(self, source_content, source_language="English"):
        self.source_content = source_content
        self.source_language = source_language
        # Target language -> seed used for every translation into it.
        self.language_seeds = {
            "Spanish": 3000,
            "French": 3001,
            "German": 3002,
            "Italian": 3003,
            "Portuguese": 3004,
            "Japanese": 3005,
            "Chinese": 3006,
        }

    def translate_content(self, target_language):
        """Return the source content adapted for ``target_language``.

        Raises:
            ValueError: If ``target_language`` has no configured seed.
        """
        if target_language not in self.language_seeds:
            raise ValueError(f"Unsupported language: {target_language}")

        language_seed = self.language_seeds[target_language]
        prompt = f"""Translate and culturally adapt this {self.source_language} content to {target_language}:

        {self.source_content}

        Maintain the tone and intent while making it natural for {target_language} speakers."""

        return steadytext.generate(prompt, seed=language_seed, max_new_tokens=400)

    def create_multilingual_set(self):
        """Return the original plus a translation for every supported language.

        The source entry is stored unmodified under ``source_language`` with
        the placeholder seed 2999; nothing is generated for it.
        """
        original_entry = {
            "seed": 2999,  # Original content seed
            "content": self.source_content,
        }
        content_set = {self.source_language: original_entry}

        for language, language_seed in self.language_seeds.items():
            content_set[language] = {
                "seed": language_seed,
                "content": self.translate_content(language),
            }

        return content_set

    def verify_consistency(self, language, expected_seed):
        """Translate twice into ``language`` and report whether both runs match.

        ``expected_seed`` is accepted but not used: the seed always comes
        from ``self.language_seeds``.
        """
        first_pass = self.translate_content(language)
        second_pass = self.translate_content(language)

        # Identical seed and prompt, so the two passes should be equal.
        return first_pass == second_pass

# Example usage
content = "Welcome to our platform! We're excited to help you achieve your goals."
generator = MultilingualContentGenerator(content)

# Generate all translations
translations = generator.create_multilingual_set()

# Verify consistency
# Each check translates the same text twice and compares the two outputs.
print("Consistency check:")
for lang in ["Spanish", "French", "German"]:
    is_consistent = generator.verify_consistency(lang, generator.language_seeds[lang])
    print(f"{lang}: {'✓' if is_consistent else '✗'}")

Embedding Experiments¶

Explore how seeds affect embeddings and use them for various analysis tasks.

Semantic Similarity Analysis¶

Analyze how different seeds affect the semantic representation of text.

import steadytext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class SemanticAnalyzer:
    """Measure how the seed passed to ``steadytext.embed`` changes embeddings."""

    def __init__(self):
        # Every embedding computed by analyze_seed_impact,
        # keyed as "<text>_seed<seed>".
        self.embeddings = {}

    def analyze_seed_impact(self, text, seeds):
        """Embed ``text`` once per seed and compare the results pairwise.

        Returns:
            A dict with the input ``text`` and ``seeds``, a per-seed list of
            ``{"seed", "embedding", "norm"}`` records under ``"embeddings"``,
            and the pairwise cosine-similarity matrix under
            ``"similarity_matrix"``.
        """
        per_seed = []
        vectors = []

        for current_seed in seeds:
            vector = steadytext.embed(text, seed=current_seed)
            self.embeddings[f"{text}_seed{current_seed}"] = vector
            vectors.append(vector)
            per_seed.append({
                "seed": current_seed,
                "embedding": vector,
                "norm": np.linalg.norm(vector),
            })

        # One row per seed; entry (i, j) compares seed i with seed j.
        pairwise = cosine_similarity(np.array(vectors))

        return {
            "text": text,
            "seeds": seeds,
            "embeddings": per_seed,
            "similarity_matrix": pairwise,
        }

    def compare_semantic_drift(self, texts, base_seed=42, num_seeds=5):
        """Summarize, per text, how far seeded embeddings stray from their mean.

        Seeds ``base_seed`` .. ``base_seed + num_seeds - 1`` are used for
        every text. Each record holds the average, maximum and minimum
        Euclidean distance from the mean embedding.
        """
        report = []

        for text in texts:
            vectors = [
                steadytext.embed(text, seed=base_seed + step)
                for step in range(num_seeds)
            ]

            centroid = np.mean(vectors, axis=0)
            spread = [np.linalg.norm(vector - centroid) for vector in vectors]

            report.append({
                "text": text,
                "avg_deviation": np.mean(spread),
                "max_deviation": np.max(spread),
                "min_deviation": np.min(spread),
            })

        return report

    def find_stable_pairs(self, text1, text2, num_seeds=10):
        """Find seed pairs whose similarity stays close to the seed-42 baseline.

        Pair ``k`` embeds ``text1`` with seed ``100 + k`` and ``text2`` with
        seed ``200 + k``. A pair is kept when its dot-product similarity
        differs from the baseline by less than 0.05 in absolute terms.
        """
        reference = np.dot(
            steadytext.embed(text1, seed=42),
            steadytext.embed(text2, seed=42),
        )

        matches = []

        for offset in range(num_seeds):
            first_seed, second_seed = 100 + offset, 200 + offset

            first_vec = steadytext.embed(text1, seed=first_seed)
            second_vec = steadytext.embed(text2, seed=second_seed)
            score = np.dot(first_vec, second_vec)
            delta = score - reference

            if abs(delta) < 0.05:  # Absolute tolerance around the baseline
                matches.append({
                    "seed_pair": (first_seed, second_seed),
                    "similarity": score,
                    "difference": delta,
                })

        return matches

# Example usage
analyzer = SemanticAnalyzer()

# Analyze seed impact
# Four seeds produce a 4x4 pairwise similarity matrix.
result = analyzer.analyze_seed_impact("artificial intelligence", seeds=[42, 123, 456, 789])
print(f"Similarity matrix for '{result['text']}':")
print(result["similarity_matrix"])

# Compare drift across different texts
# Uses the defaults: base_seed=42 and num_seeds=5 (seeds 42-46) per text.
texts = ["AI", "machine learning", "deep learning", "neural networks"]
drift = analyzer.compare_semantic_drift(texts)
for d in drift:
    print(f"{d['text']}: avg deviation = {d['avg_deviation']:.4f}")

Domain-Specific Embedding Clusters¶

Create consistent embeddings for domain-specific text clustering.

import steadytext
import numpy as np
from collections import defaultdict

class DomainEmbeddingManager:
    """Embed text under per-domain seeds and compare the results."""

    def __init__(self):
        # Each domain owns a base seed; bases are spaced 100 apart.
        self.domain_seeds = {
            "medical": 5000,
            "legal": 5100,
            "technical": 5200,
            "financial": 5300,
            "educational": 5400,
        }
        # domain -> {text: embedding}, filled by embed_domain_text.
        self.embeddings = defaultdict(dict)

    def embed_domain_text(self, text, domain):
        """Embed ``text`` with the base seed of ``domain`` and cache it.

        Raises:
            ValueError: If ``domain`` is not a key of ``self.domain_seeds``.
        """
        if domain not in self.domain_seeds:
            raise ValueError(f"Unknown domain: {domain}")

        vector = steadytext.embed(text, seed=self.domain_seeds[domain])
        self.embeddings[domain][text] = vector
        return vector

    def create_domain_clusters(self, domain, texts):
        """Embed each text with ``domain``'s base seed plus its position.

        Returns:
            A list of ``{"text", "embedding", "seed"}`` dicts in input order.
            These embeddings are not added to ``self.embeddings``.
        """
        members = []

        for position, text in enumerate(texts):
            # The domain base plus the index gives each text its own seed.
            member_seed = self.domain_seeds[domain] + position
            members.append({
                "text": text,
                "embedding": steadytext.embed(text, seed=member_seed),
                "seed": member_seed,
            })

        return members

    def cross_domain_similarity(self, text):
        """Embed ``text`` under every domain and compare each pair of domains.

        Returns:
            A dict mapping ``"<domain1>-<domain2>"`` to the dot product of
            the two domain embeddings, one entry per unordered pair.
        """
        by_domain = {
            domain: self.embed_domain_text(text, domain)
            for domain in self.domain_seeds
        }

        names = list(by_domain.keys())
        scores = {}

        # Pair every domain with each domain that follows it.
        for index, left in enumerate(names):
            for right in names[index + 1:]:
                scores[f"{left}-{right}"] = np.dot(by_domain[left], by_domain[right])

        return scores

    def find_domain_keywords(self, domain, candidate_words):
        """Rank candidate words by distance to their shared centroid.

        Returns:
            Up to ten ``(word, distance)`` tuples, closest first.
        """
        vectors = [self.embed_domain_text(word, domain) for word in candidate_words]

        center = np.mean(vectors, axis=0)

        ranked = sorted(
            (
                (word, np.linalg.norm(vector - center))
                for word, vector in zip(candidate_words, vectors)
            ),
            key=lambda pair: pair[1],
        )

        return ranked[:10]  # Top 10 domain keywords

# Example usage
manager = DomainEmbeddingManager()

# Create domain-specific clusters
# Each term is embedded with its domain's base seed plus its list index.
medical_terms = ["diagnosis", "treatment", "patient", "symptoms", "medication"]
medical_clusters = manager.create_domain_clusters("medical", medical_terms)

legal_terms = ["contract", "litigation", "defendant", "jurisdiction", "statute"]
legal_clusters = manager.create_domain_clusters("legal", legal_terms)

# Analyze cross-domain similarity
# The same word is embedded under every domain seed and compared pairwise.
similarities = manager.cross_domain_similarity("analysis")
print("Cross-domain similarities for 'analysis':")
for pair, sim in similarities.items():
    print(f"{pair}: {sim:.3f}")

# Find domain keywords
candidates = ["research", "study", "analysis", "report", "findings", "evidence", 
              "data", "results", "conclusion", "methodology"]
medical_keywords = manager.find_domain_keywords("medical", candidates)
print("\nTop medical domain keywords:")
# Only the five closest candidates are printed.
for word, dist in medical_keywords[:5]:
    print(f"{word}: {dist:.3f}")

CLI Workflows¶

Use SteadyText's CLI with custom seeds for batch processing and automation.

Batch Processing Scripts¶

Create shell scripts for processing multiple items with different seeds.

#!/bin/bash
# batch_generate.sh - Generate multiple variations with different seeds
#
# Every product gets three descriptions. The seed for each one is
# BASE_SEED + product_index * 10 + variation, so reruns use identical seeds.
# The first three lines of each description are then collected into
# output/report.txt.

# Configuration
BASE_PROMPT="Write a product description for"
PRODUCTS=("laptop" "smartphone" "headphones" "smartwatch" "tablet")
BASE_SEED=1000
OUT_DIR="output/product_descriptions"

# Create output directory
mkdir -p "$OUT_DIR"

# Generate 3 variations per product, one file per (product, variation)
for ((idx = 0; idx < ${#PRODUCTS[@]}; idx++)); do
    product="${PRODUCTS[$idx]}"

    for ((variation = 0; variation < 3; variation++)); do
        seed=$((BASE_SEED + idx * 10 + variation))
        output_file="${OUT_DIR}/${product}_v${variation}.txt"

        echo "Generating description for $product (seed: $seed)..."
        echo "$BASE_PROMPT $product" | st generate --seed "$seed" > "$output_file"
    done
done

# Generate comparison report: the whole group is written to the file in one
# redirection, which truncates it once and then fills it in order.
{
    echo "Product Description Variations Report"
    echo "===================================="
    echo ""

    for product in "${PRODUCTS[@]}"; do
        echo "## $product"
        for v in 0 1 2; do
            echo "### Variation $v:"
            head -n 3 "${OUT_DIR}/${product}_v${v}.txt"
            echo ""
        done
    done
} > output/report.txt

Reproducible Research Pipeline¶

Build complete research workflows with seed management.

#!/usr/bin/env python3
# research_pipeline.py - Reproducible research pipeline with SteadyText

import subprocess
import json
import hashlib
from datetime import datetime
from pathlib import Path

class ResearchPipeline:
    """Drive the ``st`` CLI with derived seeds and record everything as JSON.

    All output is written below ``research_<project_name>`` in the current
    working directory, which is created on construction.
    """

    def __init__(self, project_name, base_seed=42):
        """Create the output directory and start the run metadata."""
        self.project_name = project_name
        self.base_seed = base_seed
        self.output_dir = Path(f"research_{project_name}")
        self.output_dir.mkdir(exist_ok=True)

        # Initialize metadata
        self.metadata = {
            "project": project_name,
            "base_seed": base_seed,
            "start_time": datetime.now().isoformat(),
            "experiments": []
        }

    @staticmethod
    def _write_json(path, payload):
        """Write ``payload`` to ``path`` as indented JSON."""
        # An explicit encoding keeps the files identical across platforms.
        with open(path, 'w', encoding="utf-8") as f:
            json.dump(payload, f, indent=2)

    def run_experiment(self, name, prompts, seeds_per_prompt=3):
        """Run an experiment with multiple prompts and seeds.

        The seed for prompt ``i`` and repetition ``k`` is
        ``base_seed + i * 100 + k``.

        Args:
            name: Experiment name; also used in the output file name.
            prompts: Prompt strings, each sent to ``st generate`` on stdin.
            seeds_per_prompt: Number of seeds tried per prompt.

        Returns:
            The experiment record, which is also written to
            ``experiment_<name>.json``. Failed CLI calls are reported on
            stdout and leave no entry in ``results``.
        """
        experiment_data = {
            "name": name,
            "timestamp": datetime.now().isoformat(),
            "prompts": [],
            "results": []
        }

        for prompt_idx, prompt in enumerate(prompts):
            prompt_hash = hashlib.md5(prompt.encode()).hexdigest()[:8]

            # Record every prompt that was attempted. This list used to stay
            # empty, so a prompt whose CLI calls all failed left no trace.
            experiment_data["prompts"].append({
                "prompt": prompt,
                "prompt_hash": prompt_hash
            })

            for seed_offset in range(seeds_per_prompt):
                seed = self.base_seed + prompt_idx * 100 + seed_offset

                # Run generation via CLI
                result = subprocess.run(
                    ["st", "generate", "--seed", str(seed), "--json"],
                    input=prompt,
                    capture_output=True,
                    text=True
                )

                if result.returncode == 0:
                    output = json.loads(result.stdout)

                    experiment_data["results"].append({
                        "prompt": prompt,
                        "prompt_hash": prompt_hash,
                        "seed": seed,
                        "output": output["text"],
                        "metadata": output.get("metadata", {})
                    })
                else:
                    print(f"Error generating for seed {seed}: {result.stderr}")

        # Save experiment data
        self._write_json(self.output_dir / f"experiment_{name}.json", experiment_data)

        self.metadata["experiments"].append(name)
        return experiment_data

    def generate_embeddings(self, texts, name="embeddings"):
        """Generate embeddings for a list of texts.

        The text at index ``i`` is embedded with seed
        ``base_seed + 10000 + i``. Only the first ten dimensions of each
        embedding are stored, together with the reported shape.

        Returns:
            The embeddings record, which is also written to
            ``embeddings_<name>.json``. Failed CLI calls are reported on
            stdout and leave no entry in ``embeddings``.
        """
        embeddings_data = {
            "name": name,
            "timestamp": datetime.now().isoformat(),
            "embeddings": []
        }

        for idx, text in enumerate(texts):
            seed = self.base_seed + 10000 + idx

            # Run embedding via CLI
            result = subprocess.run(
                ["st", "embed", "--seed", str(seed), "--json"],
                input=text,
                capture_output=True,
                text=True
            )

            if result.returncode == 0:
                output = json.loads(result.stdout)
                embeddings_data["embeddings"].append({
                    "text": text,
                    "seed": seed,
                    "embedding": output["embedding"][:10],  # Store first 10 dims
                    "shape": output["shape"]
                })
            else:
                # Report failures the same way run_experiment does instead of
                # dropping them silently.
                print(f"Error embedding for seed {seed}: {result.stderr}")

        # Save embeddings data
        self._write_json(self.output_dir / f"embeddings_{name}.json", embeddings_data)

        return embeddings_data

    def finalize(self):
        """Finalize the research pipeline and save metadata.

        Writes ``metadata.json`` and a Markdown ``REPORT.md`` listing the
        experiments that were run.
        """
        self.metadata["end_time"] = datetime.now().isoformat()

        # Save metadata
        self._write_json(self.output_dir / "metadata.json", self.metadata)

        # Create summary report
        report = [
            f"# Research Pipeline Report: {self.project_name}",
            f"Generated on: {self.metadata['end_time']}",
            f"Base seed: {self.metadata['base_seed']}",
            "",
            "## Experiments Conducted:",
            ""
        ]
        report.extend(f"- {exp}" for exp in self.metadata["experiments"])

        report_file = self.output_dir / "REPORT.md"
        with open(report_file, 'w', encoding="utf-8") as f:
            f.write('\n'.join(report))

        print(f"Research pipeline completed. Results in: {self.output_dir}")

# Example usage
if __name__ == "__main__":
    # Initialize pipeline
    # Creates the directory research_climate_study in the working directory.
    pipeline = ResearchPipeline("climate_study", base_seed=2024)

    # Run text generation experiments
    climate_prompts = [
        "Explain the greenhouse effect in simple terms",
        "Describe renewable energy solutions",
        "What are the impacts of deforestation?"
    ]

    # seeds_per_prompt is left at its default of 3.
    pipeline.run_experiment("climate_basics", climate_prompts)

    # Generate embeddings for key terms
    key_terms = [
        "climate change",
        "global warming",
        "carbon footprint",
        "sustainability",
        "renewable energy"
    ]

    pipeline.generate_embeddings(key_terms, "climate_terms")

    # Finalize and generate report
    pipeline.finalize()

Advanced Patterns¶

Advanced techniques for seed management in complex applications.

Seed Scheduling and Management¶

Implement sophisticated seed management for large-scale applications.

import hashlib
import time
from datetime import datetime, timedelta
from typing import Dict, List, Tuple

class SeedScheduler:
    """Hand out deterministic seeds from named, non-overlapping ranges.

    Every task owns a half-open range ``[start, end)``: ``start`` is the
    first seed the task can receive and ``end`` is never handed out. Adjacent
    ranges such as ``(1000, 2000)`` and ``(2000, 3000)`` therefore do not
    collide.
    """

    def __init__(self, base_seed=42):
        self.base_seed = base_seed
        # task name -> (start, end) seed range as passed to register_task
        self.seed_registry = {}
        # task name -> (time of the latest time-based request, seed returned)
        self.time_based_seeds = {}
        # task name -> {"count": sequential seeds issued, "last_used": datetime | None}
        self.usage_stats = {}

    def register_task(self, task_name: str, seed_range: Tuple[int, int]):
        """Register a task with a specific seed range.

        Args:
            task_name: Unique name of the task.
            seed_range: ``(start, end)`` pair, treated as half-open
                ``[start, end)``.

        Raises:
            ValueError: If the task is already registered, the range is empty
                or reversed, or it shares a seed with an existing task.
        """
        if task_name in self.seed_registry:
            raise ValueError(f"Task {task_name} already registered")

        start, end = seed_range
        # An empty range would make the modulo in get_task_seed divide by
        # zero; a reversed one would yield seeds outside the range.
        if start >= end:
            raise ValueError(
                f"Seed range for task {task_name} must satisfy start < end"
            )

        # Half-open intervals overlap only when each one starts strictly
        # before the other ends, so touching ranges are accepted.
        for existing_task, (existing_start, existing_end) in self.seed_registry.items():
            if start < existing_end and end > existing_start:
                raise ValueError(f"Seed range overlaps with task {existing_task}")

        self.seed_registry[task_name] = seed_range
        self.usage_stats[task_name] = {"count": 0, "last_used": None}

    def get_task_seed(self, task_name: str, sub_id: str = None) -> int:
        """Get a seed for a specific task and optional sub-identifier.

        With a non-empty ``sub_id`` the seed is derived from its MD5 digest,
        so the same ``sub_id`` always maps to the same seed. Without one,
        seeds are issued sequentially from ``start`` and wrap around at the
        end of the range.

        Raises:
            ValueError: If ``task_name`` has not been registered.
        """
        if task_name not in self.seed_registry:
            raise ValueError(f"Task {task_name} not registered")

        start, end = self.seed_registry[task_name]
        span = end - start

        if sub_id:
            # MD5 is used as a stable hash here, not for security: unlike the
            # built-in hash() of a str, its digest is the same in every process.
            hash_val = int(hashlib.md5(sub_id.encode()).hexdigest(), 16)
            seed = start + (hash_val % span)
        else:
            # Use sequential seeds
            count = self.usage_stats[task_name]["count"]
            seed = start + (count % span)
            self.usage_stats[task_name]["count"] += 1

        self.usage_stats[task_name]["last_used"] = datetime.now()
        return seed

    def create_time_based_seed(self, task_name: str, interval: timedelta) -> int:
        """Create seeds that change based on time intervals.

        The current time is floored to a bucket number
        (``timestamp // interval``) and that bucket is used as the
        sub-identifier. The seed depends only on the task and the bucket, so
        every scheduler with the same registration returns the same seed
        within a bucket, regardless of when it was first asked.

        Raises:
            ValueError: If ``interval`` is not a positive duration, or the
                task has not been registered.
        """
        interval_seconds = interval.total_seconds()
        if interval_seconds <= 0:
            raise ValueError("interval must be a positive duration")

        current_time = datetime.now()
        time_bucket = int(current_time.timestamp() // interval_seconds)
        seed = self.get_task_seed(task_name, f"time_{time_bucket}")

        # Recorded for inspection; the seed is always recomputed from the bucket.
        self.time_based_seeds[task_name] = (current_time, seed)
        return seed

    def get_user_seed(self, user_id: str, feature: str) -> int:
        """Get a consistent seed for a user-feature combination.

        Draws from the ``"user_features"`` task, which must already be
        registered; otherwise ``ValueError`` is raised.
        """
        combined_id = f"{user_id}_{feature}"
        return self.get_task_seed("user_features", combined_id)

    def export_seed_map(self) -> Dict:
        """Export the current seed mapping for documentation.

        Returns a dict with ``base_seed``, a copy of the task registry, and
        per-task usage stats with ``last_used`` rendered as an ISO string
        (or ``None`` when the task has not been used).
        """
        return {
            "base_seed": self.base_seed,
            # Shallow copy so callers cannot mutate the live registry.
            "registry": dict(self.seed_registry),
            "usage_stats": {
                task: {
                    "count": stats["count"],
                    "last_used": stats["last_used"].isoformat() if stats["last_used"] else None
                }
                for task, stats in self.usage_stats.items()
            }
        }

# Example usage
import json  # used below to print the exported seed map

scheduler = SeedScheduler(base_seed=1000)

# Register different tasks with non-overlapping seed ranges
scheduler.register_task("content_generation", (1000, 2000))
scheduler.register_task("embeddings", (2000, 3000))
scheduler.register_task("user_features", (3000, 4000))
scheduler.register_task("ab_testing", (4000, 5000))

# Get seeds for different purposes
content_seed = scheduler.get_task_seed("content_generation", "article_123")
embedding_seed = scheduler.get_task_seed("embeddings", "doc_456")
user_seed = scheduler.get_user_seed("user_789", "recommendations")

# Time-based seeds (changes every hour)
hourly_seed = scheduler.create_time_based_seed("ab_testing", timedelta(hours=1))

print(f"Content seed: {content_seed}")
print(f"Embedding seed: {embedding_seed}")
print(f"User seed: {user_seed}")
print(f"Hourly seed: {hourly_seed}")

# Export seed map for documentation
seed_map = scheduler.export_seed_map()
print("\nSeed Map:")
print(json.dumps(seed_map, indent=2))

Conditional Seed Strategies¶

Use different seeding strategies based on content characteristics.

import steadytext
import re
from enum import Enum
from typing import Optional

class ContentType(Enum):
    """Content categories; each one maps to its own seed offset."""
    TECHNICAL = "technical"
    CREATIVE = "creative"
    BUSINESS = "business"
    CASUAL = "casual"
    ACADEMIC = "academic"

class ConditionalSeedStrategy:
    """Derive a seed from a prompt's detected type, length and tone.

    The seed is ``base_seed`` plus an offset for the content type and one
    modifier each for length, formality and (optionally) urgency/evergreen.

    NOTE: modifier sums are not unique. For example "medium" + "urgent" and
    "short" + "evergreen" both add 400, so different combinations of
    characteristics can produce the same seed.
    """

    def __init__(self, base_seed=42):
        self.base_seed = base_seed

        # Define seed offsets for different content types
        self.content_type_offsets = {
            ContentType.TECHNICAL: 0,
            ContentType.CREATIVE: 1000,
            ContentType.BUSINESS: 2000,
            ContentType.CASUAL: 3000,
            ContentType.ACADEMIC: 4000
        }

        # Define seed modifiers for content characteristics
        self.modifiers = {
            "short": 0,
            "medium": 100,
            "long": 200,
            "formal": 0,
            "informal": 50,
            "urgent": 300,
            "evergreen": 400
        }

    def detect_content_type(self, text: str) -> ContentType:
        """Detect content type based on text characteristics.

        Counts how many keywords of each category occur in the lowercased
        text. Matching is by substring, so "api" also matches "APIs".
        CASUAL has a fixed score of 1 and is listed last, so it wins only
        when no keyword category scores at least 1.
        """
        text_lower = text.lower()

        # Simple heuristics for content type detection
        technical_keywords = ["algorithm", "function", "database", "api", "code"]
        creative_keywords = ["story", "imagine", "creative", "artistic", "design"]
        business_keywords = ["revenue", "market", "strategy", "customer", "roi"]
        academic_keywords = ["research", "study", "hypothesis", "analysis", "theory"]

        # Insertion order matters: max() returns the first entry among ties.
        scores = {
            ContentType.TECHNICAL: sum(1 for kw in technical_keywords if kw in text_lower),
            ContentType.CREATIVE: sum(1 for kw in creative_keywords if kw in text_lower),
            ContentType.BUSINESS: sum(1 for kw in business_keywords if kw in text_lower),
            ContentType.ACADEMIC: sum(1 for kw in academic_keywords if kw in text_lower),
            ContentType.CASUAL: 1  # Default score
        }

        return max(scores, key=scores.get)

    def determine_length_category(self, text: str) -> str:
        """Return "short" (< 50 words), "medium" (< 200 words) or "long"."""
        word_count = len(text.split())

        if word_count < 50:
            return "short"
        elif word_count < 200:
            return "medium"
        else:
            return "long"

    def determine_formality(self, text: str) -> str:
        """Return "informal" when more than two informal indicators occur.

        Each indicator counts at most once. The text is lowercased first so
        that capitalised contractions ("Don't", "You're") are recognised.
        """
        informal_indicators = ["you're", "don't", "can't", "won't", "!", "?"]
        text_lower = text.lower()
        informal_count = sum(
            1 for indicator in informal_indicators if indicator in text_lower
        )

        return "informal" if informal_count > 2 else "formal"

    def calculate_seed(self, 
                      text: str, 
                      override_type: Optional[ContentType] = None,
                      urgency: bool = False,
                      evergreen: bool = False) -> int:
        """Calculate appropriate seed based on content characteristics.

        Args:
            text: Prompt used for type, length and formality detection.
            override_type: Content type to use instead of the detected one.
            urgency: Add the "urgent" modifier.
            evergreen: Add the "evergreen" modifier; ignored when
                ``urgency`` is also set.

        Returns:
            ``base_seed`` plus the type offset and the applicable modifiers.
        """
        # Determine content type
        if override_type is not None:
            content_type = override_type
        else:
            content_type = self.detect_content_type(text)

        # Get base offset for content type
        seed = self.base_seed + self.content_type_offsets[content_type]

        # Add modifiers based on characteristics
        seed += self.modifiers[self.determine_length_category(text)]
        seed += self.modifiers[self.determine_formality(text)]

        # Urgency takes precedence; the two modifiers are never combined.
        if urgency:
            seed += self.modifiers["urgent"]
        elif evergreen:
            seed += self.modifiers["evergreen"]

        return seed

    def generate_with_strategy(self, 
                             prompt: str,
                             override_type: Optional[ContentType] = None,
                             *,
                             urgency: bool = False,
                             evergreen: bool = False,
                             **kwargs) -> str:
        """Generate content using conditional seed strategy.

        ``urgency`` and ``evergreen`` only influence the seed; every other
        keyword argument is forwarded unchanged to ``steadytext.generate``.
        """
        seed = self.calculate_seed(prompt, override_type, urgency, evergreen)
        return steadytext.generate(prompt, seed=seed, **kwargs)

    def batch_generate_variants(self, base_prompt: str,
                                max_new_tokens: int = 200) -> dict:
        """Generate variants for different content types.

        Returns:
            A dict keyed by content type value (e.g. "technical"); each value
            is ``{"seed": <int>, "content": <generate() result>}``.
        """
        variants = {}

        for content_type in ContentType:
            # The seed comes from the bare prompt, with the type forced.
            seed = self.calculate_seed(base_prompt, override_type=content_type)
            prompt = f"Write this in a {content_type.value} style: {base_prompt}"

            variants[content_type.value] = {
                "seed": seed,
                "content": steadytext.generate(
                    prompt, seed=seed, max_new_tokens=max_new_tokens
                )
            }

        return variants

# Example usage
strategy = ConditionalSeedStrategy(base_seed=5000)

# Test content type detection and seed calculation
test_prompts = [
    "Explain how REST APIs work",
    "Write a creative story about the future",
    "Analyze market trends for Q4",
    "Hey, what's up with the weather today?",
    "Examine the hypothesis that climate change affects biodiversity"
]

for prompt in test_prompts:
    content_type = strategy.detect_content_type(prompt)
    seed = strategy.calculate_seed(prompt)
    print(f"Prompt: {prompt[:50]}...")
    print(f"Detected type: {content_type.value}, Seed: {seed}")
    print()

# Generate with strategy
technical_prompt = "Explain machine learning algorithms"
# Compute the reported seed with the same override that generation uses, so
# the printed value is the seed actually applied.
technical_seed = strategy.calculate_seed(
    technical_prompt,
    override_type=ContentType.TECHNICAL
)
result = strategy.generate_with_strategy(
    technical_prompt,
    override_type=ContentType.TECHNICAL,
    max_new_tokens=150
)
print(f"Technical generation (seed: {technical_seed}):")
print(result[:200] + "...")

# Generate variants for different styles
base_prompt = "Describe the benefits of cloud computing"
variants = strategy.batch_generate_variants(base_prompt)

print("\nContent variants:")
for style, data in variants.items():
    print(f"\n{style.upper()} (seed: {data['seed']}):")
    print(data['content'][:150] + "...")

Best Practices¶

Follow these best practices to make the most of custom seeds in SteadyText.

1. Documentation and Reproducibility¶

Always document your seed choices and their purposes for future reference.

# Good: Document seed usage
import json

# One entry per environment; each nested mapping names a purpose and the
# seed reserved for it.
SEED_DOCUMENTATION = {
    "default": 42,
    "testing": dict(
        unit_tests=100,
        integration_tests=200,
        performance_tests=300,
    ),
    "production": dict(
        content_generation=1000,
        embeddings=2000,
        personalization=3000,
    ),
    "experiments": dict(
        ab_test_2024_q1=4000,
        feature_rollout_v2=5000,
    ),
}

# Create a seed manifest file
with open("seeds.json", "w") as manifest:
    manifest.write(json.dumps(SEED_DOCUMENTATION, indent=2))

2. Seed Range Management¶

Organize seeds into ranges to avoid conflicts and maintain clarity.

class SeedRanges:
    """Seed ranges reserved per usage category, plus a membership check."""

    # Each category owns one block of seeds: range(start, stop), stop excluded
    TESTING = range(0, 1000)
    DEVELOPMENT = range(1000, 2000)
    PRODUCTION = range(2000, 10000)
    USER_SPECIFIC = range(10000, 20000)
    TIME_BASED = range(20000, 30000)
    EXPERIMENTAL = range(30000, 40000)

    @staticmethod
    def validate_seed(seed, purpose):
        """Return True when ``seed`` lies in the range reserved for ``purpose``.

        ``purpose`` is one of the short labels "test", "dev", "prod", "user",
        "time" or "exp"; any other label yields False.
        """
        reserved_by_purpose = {
            "test": SeedRanges.TESTING,
            "dev": SeedRanges.DEVELOPMENT,
            "prod": SeedRanges.PRODUCTION,
            "user": SeedRanges.USER_SPECIFIC,
            "time": SeedRanges.TIME_BASED,
            "exp": SeedRanges.EXPERIMENTAL,
        }

        reserved = reserved_by_purpose.get(purpose)
        return reserved is not None and seed in reserved

3. Testing and Validation¶

Regularly validate that your seed-based workflows remain reproducible.

import steadytext
import hashlib

def validate_seed_reproducibility(test_cases):
    """Check that each case generates the same text twice for its seed.

    Each case is a mapping with "prompt" and "seed" keys and an optional
    "expected_hash" key. When "expected_hash" is truthy, the MD5 hex digest
    of the first generation must equal it.

    Returns:
        ``(ok, failures)`` where ``failures`` is the list of failure messages
        and ``ok`` is True only when that list is empty.
    """
    problems = []

    for case in test_cases:
        prompt, seed = case["prompt"], case["seed"]
        pinned_digest = case.get("expected_hash")

        # Two independent runs with identical inputs
        first = steadytext.generate(prompt, seed=seed)
        second = steadytext.generate(prompt, seed=seed)

        if first != second:
            problems.append(f"Inconsistent results for seed {seed}")

        # The digest comparison only runs for cases that pin one
        if pinned_digest and hashlib.md5(first.encode()).hexdigest() != pinned_digest:
            problems.append(f"Hash mismatch for seed {seed}")

    return not problems, problems

# Test cases
test_cases = [
    # "expected_hash" pins the exact output: set it to the MD5 hex digest of a
    # generation you have recorded and trust. It is left as None here, which
    # skips the hash check; a made-up placeholder would always be reported as
    # a hash mismatch.
    {"prompt": "Hello", "seed": 42, "expected_hash": None},
    {"prompt": "Test prompt", "seed": 100},
    {"prompt": "Another test", "seed": 200}
]

is_valid, errors = validate_seed_reproducibility(test_cases)
if not is_valid:
    print("Validation failed:", errors)

This comprehensive guide demonstrates the power and flexibility of custom seeds in SteadyText. By using seeds strategically, you can achieve reproducible research, conduct effective A/B testing, generate controlled variations, and build robust content generation pipelines.