Practical recipes for working with Kaleidoscope primitives. Copy, paste, adapt.
The heavy lifting here is SQL — DuckDB does the work. R is just the wrapper for execution and visualization. These same queries work in Python (duckdb.connect()), Julia, or any language with a DuckDB driver.
Load the required libraries and connect to the DuckDB database containing primitives.
# Install packages (first time only)
# install.packages(c("duckdb", "dplyr", "ggplot2", "tidyr"))
# Load libraries
library(duckdb)
library(dplyr)
library(ggplot2)
library(tidyr)
# Connect to DuckDB (read-only)
con <- dbConnect(
duckdb(),
"path/to/primitives_snapshot.duckdb",
read_only = TRUE
)
# List available tables
dbListTables(con)
Always use read_only = TRUE when analyzing. It's faster and prevents accidental writes.
Understand the schema and available primitives before diving in.
# Filings table structure
dbGetQuery(con, "DESCRIBE filings")
# Primitives table structure
dbGetQuery(con, "DESCRIBE primitives")
dbGetQuery(con, "
SELECT form, COUNT(*) as n, MIN(filing_date) as earliest, MAX(filing_date) as latest
FROM filings
GROUP BY form
ORDER BY n DESC
")
# Get column names from primitives table (each column is a primitive)
primitives <- dbGetQuery(con, "SELECT * FROM primitives LIMIT 1")
primitive_names <- names(primitives)
print(primitive_names)
Pull one primitive for a specific ticker and analyze its trend over time.
# Get revenue declining signal for Apple
aapl_signal <- dbGetQuery(con, "
SELECT
f.filing_date,
f.form,
p.is_revenue_declining,
p.num_sentences
FROM filings f
JOIN primitives p USING (accession_number)
WHERE f.ticker = 'AAPL'
AND f.form IN ('10-K', '10-Q')
ORDER BY f.filing_date
")
# Normalize by sentence count (percentage of filing)
aapl_signal <- aapl_signal %>%
mutate(
signal_pct = is_revenue_declining / num_sentences * 100
)
# View the trend
print(aapl_signal)
Raw counts vary with filing length. A 10-K has ~3,000 sentences vs ~800 for a 10-Q. Always normalize by num_sentences for fair comparison.
Compare a company's signal to the market or sector baseline using z-scores.
# Calculate z-scores vs all companies in same quarter
z_scores <- dbGetQuery(con, "
WITH quarterly_stats AS (
SELECT
DATE_TRUNC('quarter', f.filing_date) as quarter,
AVG(p.is_revenue_declining / p.num_sentences) as mean_signal,
STDDEV(p.is_revenue_declining / p.num_sentences) as std_signal
FROM filings f
JOIN primitives p USING (accession_number)
WHERE f.form IN ('10-K', '10-Q')
AND p.num_sentences > 200
GROUP BY quarter
)
SELECT
f.ticker,
f.filing_date,
DATE_TRUNC('quarter', f.filing_date) as quarter,
p.is_revenue_declining / p.num_sentences as raw_signal,
qs.mean_signal,
qs.std_signal,
(p.is_revenue_declining / p.num_sentences - qs.mean_signal) / NULLIF(qs.std_signal, 0) as z_score
FROM filings f
JOIN primitives p USING (accession_number)
JOIN quarterly_stats qs ON DATE_TRUNC('quarter', f.filing_date) = qs.quarter
WHERE f.ticker = 'XRX'
AND f.form IN ('10-K', '10-Q')
ORDER BY f.filing_date
")
print(z_scores)
A z-score of +2.8 means XRX's "revenue declining" language is 2.8 standard deviations above the market average — a strong distress signal.
Combine multiple primitives into a single "Storm Score" for identifying distressed companies.
# Storm Score: weighted combination of distress signals
storm_scores <- dbGetQuery(con, "
WITH company_signals AS (
SELECT
f.ticker,
MAX(f.company_name) as company,
AVG(p.is_revenue_declining / p.num_sentences * 100) as rev_declining,
AVG(p.is_cost_cutting / p.num_sentences * 100) as cost_cutting,
AVG(p.is_below_expectations / p.num_sentences * 100) as missing_exp,
AVG(p.is_above_expectations / p.num_sentences * 100) -
AVG(p.is_below_expectations / p.num_sentences * 100) as net_beat,
AVG(p.is_cautious_or_hedging_tone / p.num_sentences * 100) as hedging
FROM filings f
JOIN primitives p USING (accession_number)
WHERE f.form IN ('10-K', '10-Q')
AND f.filing_date >= '2024-01-01'
AND p.num_sentences BETWEEN 300 AND 5000
GROUP BY f.ticker
HAVING COUNT(*) >= 2
)
SELECT
ticker,
company,
ROUND(rev_declining, 1) as rev_declining,
ROUND(cost_cutting, 1) as cost_cutting,
ROUND(missing_exp, 1) as missing_exp,
ROUND(rev_declining + cost_cutting + missing_exp
- (net_beat * 2) + (hedging / 3), 1) as storm_score
FROM company_signals
ORDER BY storm_score DESC
LIMIT 20
")
print(storm_scores)
Storm Score = Revenue Declining + Cost Cutting + Missing Expectations − (Net Beat × 2) + (Hedging / 3). Positive signals subtract; negative signals add. Scores above 100 indicate high distress.
Create publication-ready charts with ggplot2.
# Plot a company's signal trajectory
ggplot(aapl_signal, aes(x = filing_date, y = signal_pct)) +
geom_line(color = "#6366f1", linewidth = 1.2) +
geom_point(color = "#6366f1", size = 3) +
labs(
title = "AAPL: Revenue Declining Signal",
subtitle = "Percentage of filing sentences flagged",
x = NULL,
y = "Signal %"
) +
theme_minimal(base_size = 14) +
theme(
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(color = "gray50")
)
# Pull data for multiple tickers
multi_ticker <- dbGetQuery(con, "
SELECT
f.ticker,
f.filing_date,
p.is_revenue_declining / p.num_sentences * 100 as signal_pct
FROM filings f
JOIN primitives p USING (accession_number)
WHERE f.ticker IN ('AAPL', 'MSFT', 'GOOGL', 'META')
AND f.form IN ('10-K', '10-Q')
AND f.filing_date >= '2020-01-01'
ORDER BY f.filing_date
")
# Faceted comparison
ggplot(multi_ticker, aes(x = filing_date, y = signal_pct, color = ticker)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
facet_wrap(~ticker, scales = "free_y") +
labs(title = "Revenue Declining Signal: Big Tech Comparison") +
theme_minimal() +
theme(legend.position = "none")
Calculate the velocity (rate of change) of a signal to detect turning points.
# Calculate quarter-over-quarter change in signal
signal_momentum <- dbGetQuery(con, "
WITH signal_history AS (
SELECT
f.ticker,
f.filing_date,
p.is_revenue_declining / p.num_sentences * 100 as signal_pct,
LAG(p.is_revenue_declining / p.num_sentences * 100)
OVER (PARTITION BY f.ticker ORDER BY f.filing_date) as prev_signal
FROM filings f
JOIN primitives p USING (accession_number)
WHERE f.form IN ('10-K', '10-Q')
AND f.filing_date >= '2022-01-01'
AND p.num_sentences > 200
)
SELECT
ticker,
filing_date,
signal_pct,
prev_signal,
ROUND(signal_pct - prev_signal, 2) as delta,
CASE
WHEN signal_pct - prev_signal > 0.5 THEN 'Deteriorating'
WHEN signal_pct - prev_signal < -0.5 THEN 'Improving'
ELSE 'Stable'
END as trend
FROM signal_history
WHERE prev_signal IS NOT NULL
AND ticker = 'VFC'
ORDER BY filing_date
")
print(signal_momentum)
Positive delta = signal increasing (getting worse for distress signals). Look for sustained positive momentum as an early warning.
Rank stocks by signal, form quintiles, and check forward returns.
# Create quintile portfolios based on signal strength
quintile_returns <- dbGetQuery(con, "
WITH ranked AS (
SELECT
f.ticker,
f.filing_date,
p.is_revenue_declining / p.num_sentences as signal,
NTILE(5) OVER (
PARTITION BY DATE_TRUNC('quarter', f.filing_date)
ORDER BY p.is_revenue_declining / p.num_sentences
) as quintile
FROM filings f
JOIN primitives p USING (accession_number)
WHERE f.form IN ('10-K', '10-Q')
AND f.filing_date >= '2022-01-01'
AND p.num_sentences BETWEEN 300 AND 5000
)
SELECT
quintile,
COUNT(*) as n_filings,
AVG(signal) as avg_signal
FROM ranked
GROUP BY quintile
ORDER BY quintile
")
print(quintile_returns)
Join with your price database to calculate 30/60/90-day forward returns per quintile. Quintile 1 (lowest distress) should outperform Quintile 5 (highest distress) if the signal has predictive power.
# Pseudo-code for return calculation (requires price data)
# returns <- quintile_data %>%
# left_join(prices, by = c("ticker", "filing_date")) %>%
# mutate(
# fwd_30d = lead(price, 30) / price - 1,
# fwd_90d = lead(price, 90) / price - 1
# ) %>%
# group_by(quintile) %>%
# summarize(
# mean_30d = mean(fwd_30d, na.rm = TRUE),
# mean_90d = mean(fwd_90d, na.rm = TRUE)
# )
Always disconnect when you're done.
# Close the connection
dbDisconnect(con, shutdown = TRUE)