01 Setup & Connect

Load the required libraries and connect to the DuckDB database containing primitives.

# Install packages (first time only)
# install.packages(c("duckdb", "dplyr", "ggplot2", "tidyr"))

# Load libraries
library(duckdb)
library(dplyr)
library(ggplot2)
library(tidyr)

# Connect to DuckDB (read-only)
con <- dbConnect(
  duckdb(),
  "path/to/primitives_snapshot.duckdb",
  read_only = TRUE
)

# List available tables
dbListTables(con)
[1] "filings" "primitives" "tickers"
Tip

Always use read_only = TRUE when analyzing. It's faster and prevents accidental writes.

02 Explore the Data

Understand the schema and available primitives before diving in.

Check table schemas

# Filings table structure
dbGetQuery(con, "DESCRIBE filings")

# Primitives table structure
dbGetQuery(con, "DESCRIBE primitives")

Get filing counts by form type

dbGetQuery(con, "
  SELECT form, COUNT(*) as n, MIN(filing_date) as earliest, MAX(filing_date) as latest
  FROM filings
  GROUP BY form
  ORDER BY n DESC
")

List all available primitives

# Get column names from primitives table (each column is a primitive)
primitives <- dbGetQuery(con, "SELECT * FROM primitives LIMIT 1")
primitive_names <- names(primitives)
print(primitive_names)

03 Single Signal Analysis

Pull one primitive for a specific ticker and analyze its trend over time.

# Get revenue declining signal for Apple
aapl_signal <- dbGetQuery(con, "
  SELECT
    f.filing_date,
    f.form,
    p.is_revenue_declining,
    p.num_sentences
  FROM filings f
  JOIN primitives p USING (accession_number)
  WHERE f.ticker = 'AAPL'
    AND f.form IN ('10-K', '10-Q')
  ORDER BY f.filing_date
")

# Normalize by sentence count (percentage of filing)
aapl_signal <- aapl_signal %>%
  mutate(
    signal_pct = is_revenue_declining / num_sentences * 100
  )

# View the trend
print(aapl_signal)
Why normalize?

Raw counts vary with filing length. A 10-K has ~3,000 sentences vs ~800 for a 10-Q. Always normalize by num_sentences for fair comparison.

04 Z-Score Normalization

Compare a company's signal to the market or sector baseline using z-scores.

# Calculate z-scores vs all companies in same quarter
z_scores <- dbGetQuery(con, "
  WITH quarterly_stats AS (
    SELECT
      DATE_TRUNC('quarter', f.filing_date) as quarter,
      AVG(p.is_revenue_declining / p.num_sentences) as mean_signal,
      STDDEV(p.is_revenue_declining / p.num_sentences) as std_signal
    FROM filings f
    JOIN primitives p USING (accession_number)
    WHERE f.form IN ('10-K', '10-Q')
      AND p.num_sentences > 200
    GROUP BY quarter
  )
  SELECT
    f.ticker,
    f.filing_date,
    DATE_TRUNC('quarter', f.filing_date) as quarter,
    p.is_revenue_declining / p.num_sentences as raw_signal,
    qs.mean_signal,
    qs.std_signal,
    (p.is_revenue_declining / p.num_sentences - qs.mean_signal) / NULLIF(qs.std_signal, 0) as z_score
  FROM filings f
  JOIN primitives p USING (accession_number)
  JOIN quarterly_stats qs ON DATE_TRUNC('quarter', f.filing_date) = qs.quarter
  WHERE f.ticker = 'XRX'
    AND f.form IN ('10-K', '10-Q')
  ORDER BY f.filing_date
")

print(z_scores)
ticker filing_date quarter raw_signal mean_signal std_signal z_score 1 XRX 2023-02-22 2023-01-01 0.0312 0.0089 0.0098 2.28 2 XRX 2023-05-09 2023-04-01 0.0298 0.0091 0.0095 2.18 3 XRX 2023-08-08 2023-07-01 0.0345 0.0087 0.0092 2.80

A z-score of +2.8 means XRX's "revenue declining" language is 2.8 standard deviations above the market average — a strong distress signal.

05 Composite Scoring

Combine multiple primitives into a single "Storm Score" for identifying distressed companies.

# Storm Score: weighted combination of distress signals
storm_scores <- dbGetQuery(con, "
  WITH company_signals AS (
    SELECT
      f.ticker,
      MAX(f.company_name) as company,
      AVG(p.is_revenue_declining / p.num_sentences * 100) as rev_declining,
      AVG(p.is_cost_cutting / p.num_sentences * 100) as cost_cutting,
      AVG(p.is_below_expectations / p.num_sentences * 100) as missing_exp,
      AVG(p.is_above_expectations / p.num_sentences * 100) -
        AVG(p.is_below_expectations / p.num_sentences * 100) as net_beat,
      AVG(p.is_cautious_or_hedging_tone / p.num_sentences * 100) as hedging
    FROM filings f
    JOIN primitives p USING (accession_number)
    WHERE f.form IN ('10-K', '10-Q')
      AND f.filing_date >= '2024-01-01'
      AND p.num_sentences BETWEEN 300 AND 5000
    GROUP BY f.ticker
    HAVING COUNT(*) >= 2
  )
  SELECT
    ticker,
    company,
    ROUND(rev_declining, 1) as rev_declining,
    ROUND(cost_cutting, 1) as cost_cutting,
    ROUND(missing_exp, 1) as missing_exp,
    ROUND(rev_declining + cost_cutting + missing_exp
          - (net_beat * 2) + (hedging / 3), 1) as storm_score
  FROM company_signals
  ORDER BY storm_score DESC
  LIMIT 20
")

print(storm_scores)
Formula rationale

Storm Score = Revenue Declining + Cost Cutting + Missing Expectations − (Net Beat × 2) + (Hedging / 3). Positive signals subtract; negative signals add. Scores above 100 indicate high distress.

06 Visualization

Create publication-ready charts with ggplot2.

Signal trend over time

# Plot a company's signal trajectory
ggplot(aapl_signal, aes(x = filing_date, y = signal_pct)) +
  geom_line(color = "#6366f1", linewidth = 1.2) +
  geom_point(color = "#6366f1", size = 3) +
  labs(
    title = "AAPL: Revenue Declining Signal",
    subtitle = "Percentage of filing sentences flagged",
    x = NULL,
    y = "Signal %"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold"),
    plot.subtitle = element_text(color = "gray50")
  )

Compare multiple companies

# Pull data for multiple tickers
multi_ticker <- dbGetQuery(con, "
  SELECT
    f.ticker,
    f.filing_date,
    p.is_revenue_declining / p.num_sentences * 100 as signal_pct
  FROM filings f
  JOIN primitives p USING (accession_number)
  WHERE f.ticker IN ('AAPL', 'MSFT', 'GOOGL', 'META')
    AND f.form IN ('10-K', '10-Q')
    AND f.filing_date >= '2020-01-01'
  ORDER BY f.filing_date
")

# Faceted comparison
ggplot(multi_ticker, aes(x = filing_date, y = signal_pct, color = ticker)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  facet_wrap(~ticker, scales = "free_y") +
  labs(title = "Revenue Declining Signal: Big Tech Comparison") +
  theme_minimal() +
  theme(legend.position = "none")

07 Signal Momentum

Calculate the velocity (rate of change) of a signal to detect turning points.

# Calculate quarter-over-quarter change in signal
signal_momentum <- dbGetQuery(con, "
  WITH signal_history AS (
    SELECT
      f.ticker,
      f.filing_date,
      p.is_revenue_declining / p.num_sentences * 100 as signal_pct,
      LAG(p.is_revenue_declining / p.num_sentences * 100)
        OVER (PARTITION BY f.ticker ORDER BY f.filing_date) as prev_signal
    FROM filings f
    JOIN primitives p USING (accession_number)
    WHERE f.form IN ('10-K', '10-Q')
      AND f.filing_date >= '2022-01-01'
      AND p.num_sentences > 200
  )
  SELECT
    ticker,
    filing_date,
    signal_pct,
    prev_signal,
    ROUND(signal_pct - prev_signal, 2) as delta,
    CASE
      WHEN signal_pct - prev_signal > 0.5 THEN 'Deteriorating'
      WHEN signal_pct - prev_signal < -0.5 THEN 'Improving'
      ELSE 'Stable'
    END as trend
  FROM signal_history
  WHERE prev_signal IS NOT NULL
    AND ticker = 'VFC'
  ORDER BY filing_date
")

print(signal_momentum)

Positive delta = signal increasing (getting worse for distress signals). Look for sustained positive momentum as an early warning.

08 Simple Backtest

Rank stocks by signal, form quintiles, and check forward returns.

# Create quintile portfolios based on signal strength
quintile_returns <- dbGetQuery(con, "
  WITH ranked AS (
    SELECT
      f.ticker,
      f.filing_date,
      p.is_revenue_declining / p.num_sentences as signal,
      NTILE(5) OVER (
        PARTITION BY DATE_TRUNC('quarter', f.filing_date)
        ORDER BY p.is_revenue_declining / p.num_sentences
      ) as quintile
    FROM filings f
    JOIN primitives p USING (accession_number)
    WHERE f.form IN ('10-K', '10-Q')
      AND f.filing_date >= '2022-01-01'
      AND p.num_sentences BETWEEN 300 AND 5000
  )
  SELECT
    quintile,
    COUNT(*) as n_filings,
    AVG(signal) as avg_signal
  FROM ranked
  GROUP BY quintile
  ORDER BY quintile
")

print(quintile_returns)
Next step: Add price data

Join with your price database to calculate 30/60/90-day forward returns per quintile. Quintile 1 (lowest distress) should outperform Quintile 5 (highest distress) if the signal has predictive power.

# Pseudo-code for return calculation (requires price data)
# returns <- quintile_data %>%
#   left_join(prices, by = c("ticker", "filing_date")) %>%
#   mutate(
#     fwd_30d = lead(price, 30) / price - 1,
#     fwd_90d = lead(price, 90) / price - 1
#   ) %>%
#   group_by(quintile) %>%
#   summarize(
#     mean_30d = mean(fwd_30d, na.rm = TRUE),
#     mean_90d = mean(fwd_90d, na.rm = TRUE)
#   )

Cleanup

Always disconnect when you're done.

# Close the connection
dbDisconnect(con, shutdown = TRUE)