SQL queries for Datasette

# CELL: Create SQL Queries for Berkeley Housing Analysis

print("📊 BERKELEY HOUSING DATASETTE QUERIES\n")

print("="*70)

import sqlite3

import pandas as pd

# First, let's check what columns we have

db_path = '/Users/johngage/berkeley-data/berkeley_housing_map.db'

conn = sqlite3.connect(db_path)

cursor = conn.cursor()

cursor.execute("PRAGMA table_info(projects)")

columns = cursor.fetchall()

print("\n1️⃣ AVAILABLE COLUMNS IN PROJECTS TABLE:\n")

for col in columns:

print(f" • {col[1]} ({col[2]})")

conn.close()

print(f"\n{'='*70}")

# CELL: SQL Query Collection for Datasette

queries = {

"Query 1": {

"name": "Total Housing Units Overview",

"description": "Summary statistics of all housing projects",

"sql": """

SELECT

COUNT(*) as total_projects,

SUM(net_units) as total_units,

AVG(net_units) as avg_units_per_project,

MIN(net_units) as smallest_project,

MAX(net_units) as largest_project,

MIN(year) as earliest_year,

MAX(year) as latest_year

FROM projects

WHERE net_units > 0

"""

},

"Query 2": {

"name": "Projects by Size Category",

"description": "Breakdown of projects by unit count ranges",

"sql": """

SELECT

CASE

WHEN net_units >= 200 THEN '🔴 Large (200+ units)'

WHEN net_units >= 100 THEN '🟠 Medium-Large (100-199 units)'

WHEN net_units >= 50 THEN '🟡 Medium (50-99 units)'

WHEN net_units >= 20 THEN '🟢 Small-Medium (20-49 units)'

ELSE '🔵 Small (<20 units)'

END as size_category,

COUNT(*) as num_projects,

SUM(net_units) as total_units,

ROUND(AVG(net_units), 1) as avg_units,

ROUND(SUM(net_units) * 100.0 / (SELECT SUM(net_units) FROM projects), 1) as percent_of_total

FROM projects

WHERE net_units > 0

GROUP BY size_category

ORDER BY MIN(net_units) DESC

"""

},

"Query 3": {

"name": "Projects by Year and Status",

"description": "Timeline of housing development with status breakdown",

"sql": """

SELECT

year,

status,

COUNT(*) as projects,

SUM(net_units) as units,

ROUND(AVG(net_units), 1) as avg_units_per_project

FROM projects

WHERE year IS NOT NULL

GROUP BY year, status

ORDER BY year DESC, status

"""

},

"Query 4": {

"name": "Geographic Distribution (Top Streets)",

"description": "Which streets have the most housing development",

"sql": """

SELECT

SUBSTR(address_display, INSTR(address_display, ' ') + 1) as street_name,

COUNT(*) as num_projects,

SUM(net_units) as total_units,

ROUND(AVG(net_units), 1) as avg_units,

GROUP_CONCAT(CAST(net_units AS TEXT) || ' units', ', ') as project_sizes

FROM projects

WHERE address_display IS NOT NULL

GROUP BY street_name

HAVING COUNT(*) >= 2

ORDER BY total_units DESC

LIMIT 20

"""

}

# Print all queries

for key, query in queries.items():

print(f"\n{'='*70}")

print(f"\n{query['name']}")

print(f"{'='*70}")

print(f"\n{query['description']}\n")

print("```sql")

print(query['sql'].strip())

print("```")

print(f"\n{'='*70}")

# CELL: Advanced SQL Queries (with joins to other tables)

advanced_queries = {

"Query 5": {

"name": "Projects with Business Licenses at Address",

"description": "Join housing projects with business license data",

"requires": "berkeley.db with licenses table",

"sql": """

-- NOTE: Requires joining to berkeley.db licenses table

SELECT

p.address_display,

p.net_units,

p.year,

p.status,

COUNT(DISTINCT l.recordid) as num_businesses,

GROUP_CONCAT(DISTINCT l.busdesc, '; ') as business_types

FROM projects p

LEFT JOIN licenses l ON p.apn = l.apn_normalized

GROUP BY p.address_display, p.net_units, p.year, p.status

HAVING num_businesses > 0

ORDER BY num_businesses DESC, p.net_units DESC

"""

},

"Query 6": {

"name": "Projects by Neighborhood/Zip Code",

"description": "Geographic clustering of development",

"requires": "zipcode or neighborhood field",

"sql": """

-- If you have zipcode in projects table:

SELECT

SUBSTR(address_display, -5) as estimated_area,

COUNT(*) as projects,

SUM(net_units) as total_units,

ROUND(AVG(net_units), 1) as avg_units

FROM projects

GROUP BY estimated_area

ORDER BY total_units DESC

LIMIT 15

"""

},

"Query 7": {

"name": "Development Timeline - Year over Year",

"description": "Trend analysis of housing production",

"sql": """

SELECT

year,

COUNT(*) as projects,

SUM(net_units) as units_this_year,

SUM(SUM(net_units)) OVER (ORDER BY year) as cumulative_units,

ROUND(AVG(net_units), 1) as avg_project_size,

ROUND((SUM(net_units) - LAG(SUM(net_units)) OVER (ORDER BY year)) * 100.0 /

LAG(SUM(net_units)) OVER (ORDER BY year), 1) as percent_change_from_prior_year

FROM projects

WHERE year IS NOT NULL

GROUP BY year

ORDER BY year

"""

},

"Query 8": {

"name": "Large Projects Deep Dive",

"description": "Detailed view of projects with 50+ units",

"sql": """

SELECT

address_display,

net_units,

year,

status,

permits,

apn,

ROUND(latitude, 6) as lat,

ROUND(longitude, 6) as lon,

'https://www.google.com/maps/search/?api=1&query=' ||

REPLACE(address_display, ' ', '+') || '+Berkeley+CA' as google_maps_link

FROM projects

WHERE net_units >= 50

ORDER BY net_units DESC

"""

}

# Print advanced queries

for key, query in advanced_queries.items():

print(f"\n{'='*70}")

print(f"\n{query['name']}")

print(f"{'='*70}")

print(f"\n{query['description']}")

if 'requires' in query:

print(f"⚠️ Requires: {query['requires']}")

print()

print("```sql")

print(query['sql'].strip())

print("```")

print(f"\n{'='*70}")

# CELL: Save SQL Query Reference

query_reference = """# Berkeley Housing Datasette SQL Queries

## Query 1: Total Housing Units Overview

Summary statistics of all housing projects

```sql

SELECT

COUNT(*) as total_projects,

SUM(net_units) as total_units,

AVG(net_units) as avg_units_per_project,

MIN(net_units) as smallest_project,

MAX(net_units) as largest_project,

MIN(year) as earliest_year,

MAX(year) as latest_year

FROM projects

WHERE net_units > 0

```

## Query 2: Projects by Size Category

Breakdown of projects by unit count ranges

```sql

SELECT

CASE

WHEN net_units >= 200 THEN '🔴 Large (200+ units)'

WHEN net_units >= 100 THEN '🟠 Medium-Large (100-199 units)'

WHEN net_units >= 50 THEN '🟡 Medium (50-99 units)'

WHEN net_units >= 20 THEN '🟢 Small-Medium (20-49 units)'

ELSE '🔵 Small (<20 units)'

END as size_category,

COUNT(*) as num_projects,

SUM(net_units) as total_units,

ROUND(AVG(net_units), 1) as avg_units,

ROUND(SUM(net_units) * 100.0 / (SELECT SUM(net_units) FROM projects), 1) as percent_of_total

FROM projects

WHERE net_units > 0

GROUP BY size_category

ORDER BY MIN(net_units) DESC

```

## Query 3: Projects by Year and Status

Timeline of housing development with status breakdown

```sql

SELECT

year,

status,

COUNT(*) as projects,

SUM(net_units) as units,

ROUND(AVG(net_units), 1) as avg_units_per_project

FROM projects

WHERE year IS NOT NULL

GROUP BY year, status

ORDER BY year DESC, status

```

## Query 4: Top Streets for Development

Which streets have the most housing development

```sql

SELECT

SUBSTR(address_display, INSTR(address_display, ' ') + 1) as street_name,

COUNT(*) as num_projects,

SUM(net_units) as total_units,

ROUND(AVG(net_units), 1) as avg_units

FROM projects

WHERE address_display IS NOT NULL

GROUP BY street_name

HAVING COUNT(*) >= 2

ORDER BY total_units DESC

LIMIT 20

```

## Query 5: Year-over-Year Growth

Trend analysis with year-over-year comparison

```sql

SELECT

year,

COUNT(*) as projects,

SUM(net_units) as units_this_year,

SUM(SUM(net_units)) OVER (ORDER BY year) as cumulative_units,

ROUND((SUM(net_units) - LAG(SUM(net_units)) OVER (ORDER BY year)) * 100.0 /

LAG(SUM(net_units)) OVER (ORDER BY year), 1) as pct_change

FROM projects

WHERE year IS NOT NULL

GROUP BY year

ORDER BY year

```

## Query 6: Projects with Map Links

Large projects with Google Maps links

```sql

SELECT

address_display,

net_units,

year,

status,

'https://www.google.com/maps/search/?api=1&query=' ||

REPLACE(address_display, ' ', '+') || '+Berkeley+CA' as map_link

FROM projects

WHERE net_units >= 50

ORDER BY net_units DESC

```

---

## Using These Queries

1. Visit: https://berkeley-housing.fly.dev/berkeley_housing_map/projects

2. Click "View and edit SQL"

3. Copy and paste any query

4. Click "Run SQL"

5. Export results as CSV/JSON if needed

"""

# Save to file

reference_path = '/Users/johngage/berkeley-data/DATASETTE_QUERIES.md'

with open(reference_path, 'w') as f:

f.write(query_reference)

print(f"✅ Saved query reference to: {reference_path}")

print(f"\nThis file contains all SQL queries ready to use in Datasette!")

SQL queries for Datasette

Extend queries to show all project progress annotated by periodic tests of monitoring framework