KPIX: Berkeley rezoning affects our businesses
# CELL: Create SQL Queries for Berkeley Housing Analysis
print("📊 BERKELEY HOUSING DATASETTE QUERIES\n")
print("="*70)
import sqlite3
import pandas as pd
# First, let's check what columns we have
db_path = '/Users/johngage/berkeley-data/berkeley_housing_map.db'
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
cursor.execute("PRAGMA table_info(projects)")
columns = cursor.fetchall()
print("\n1️⃣ AVAILABLE COLUMNS IN PROJECTS TABLE:\n")
for col in columns:
print(f" • {col[1]} ({col[2]})")
conn.close()
print(f"\n{'='*70}")
# CELL: SQL Query Collection for Datasette
queries = {
"Query 1": {
"name": "Total Housing Units Overview",
"description": "Summary statistics of all housing projects",
"sql": """
SELECT
COUNT(*) as total_projects,
SUM(net_units) as total_units,
AVG(net_units) as avg_units_per_project,
MIN(net_units) as smallest_project,
MAX(net_units) as largest_project,
MIN(year) as earliest_year,
MAX(year) as latest_year
FROM projects
WHERE net_units > 0
"""
},
"Query 2": {
"name": "Projects by Size Category",
"description": "Breakdown of projects by unit count ranges",
"sql": """
SELECT
CASE
WHEN net_units >= 200 THEN '🔴 Large (200+ units)'
WHEN net_units >= 100 THEN '🟠 Medium-Large (100-199 units)'
WHEN net_units >= 50 THEN '🟡 Medium (50-99 units)'
WHEN net_units >= 20 THEN '🟢 Small-Medium (20-49 units)'
ELSE '🔵 Small (<20 units)'
END as size_category,
COUNT(*) as num_projects,
SUM(net_units) as total_units,
ROUND(AVG(net_units), 1) as avg_units,
ROUND(SUM(net_units) * 100.0 / (SELECT SUM(net_units) FROM projects), 1) as percent_of_total
FROM projects
WHERE net_units > 0
GROUP BY size_category
ORDER BY MIN(net_units) DESC
"""
},
"Query 3": {
"name": "Projects by Year and Status",
"description": "Timeline of housing development with status breakdown",
"sql": """
SELECT
year,
status,
COUNT(*) as projects,
SUM(net_units) as units,
ROUND(AVG(net_units), 1) as avg_units_per_project
FROM projects
WHERE year IS NOT NULL
GROUP BY year, status
ORDER BY year DESC, status
"""
},
"Query 4": {
"name": "Geographic Distribution (Top Streets)",
"description": "Which streets have the most housing development",
"sql": """
SELECT
SUBSTR(address_display, INSTR(address_display, ' ') + 1) as street_name,
COUNT(*) as num_projects,
SUM(net_units) as total_units,
ROUND(AVG(net_units), 1) as avg_units,
GROUP_CONCAT(CAST(net_units AS TEXT) || ' units', ', ') as project_sizes
FROM projects
WHERE address_display IS NOT NULL
GROUP BY street_name
HAVING COUNT(*) >= 2
ORDER BY total_units DESC
LIMIT 20
"""
}
}
# Print all queries
for key, query in queries.items():
print(f"\n{'='*70}")
print(f"\n{query['name']}")
print(f"{'='*70}")
print(f"\n{query['description']}\n")
print("```sql")
print(query['sql'].strip())
print("```")
print(f"\n{'='*70}")
# CELL: Advanced SQL Queries (with joins to other tables)
advanced_queries = {
"Query 5": {
"name": "Projects with Business Licenses at Address",
"description": "Join housing projects with business license data",
"requires": "berkeley.db with licenses table",
"sql": """
-- NOTE: Requires joining to berkeley.db licenses table
SELECT
p.address_display,
p.net_units,
p.year,
p.status,
COUNT(DISTINCT l.recordid) as num_businesses,
GROUP_CONCAT(DISTINCT l.busdesc, '; ') as business_types
FROM projects p
LEFT JOIN licenses l ON p.apn = l.apn_normalized
GROUP BY p.address_display, p.net_units, p.year, p.status
HAVING num_businesses > 0
ORDER BY num_businesses DESC, p.net_units DESC
"""
},
"Query 6": {
"name": "Projects by Neighborhood/Zip Code",
"description": "Geographic clustering of development",
"requires": "zipcode or neighborhood field",
"sql": """
-- If you have zipcode in projects table:
SELECT
SUBSTR(address_display, -5) as estimated_area,
COUNT(*) as projects,
SUM(net_units) as total_units,
ROUND(AVG(net_units), 1) as avg_units
FROM projects
GROUP BY estimated_area
ORDER BY total_units DESC
LIMIT 15
"""
},
"Query 7": {
"name": "Development Timeline - Year over Year",
"description": "Trend analysis of housing production",
"sql": """
SELECT
year,
COUNT(*) as projects,
SUM(net_units) as units_this_year,
SUM(SUM(net_units)) OVER (ORDER BY year) as cumulative_units,
ROUND(AVG(net_units), 1) as avg_project_size,
ROUND((SUM(net_units) - LAG(SUM(net_units)) OVER (ORDER BY year)) * 100.0 /
LAG(SUM(net_units)) OVER (ORDER BY year), 1) as percent_change_from_prior_year
FROM projects
WHERE year IS NOT NULL
GROUP BY year
ORDER BY year
"""
},
"Query 8": {
"name": "Large Projects Deep Dive",
"description": "Detailed view of projects with 50+ units",
"sql": """
SELECT
address_display,
net_units,
year,
status,
permits,
apn,
ROUND(latitude, 6) as lat,
ROUND(longitude, 6) as lon,
'https://www.google.com/maps/search/?api=1&query=' ||
REPLACE(address_display, ' ', '+') || '+Berkeley+CA' as google_maps_link
FROM projects
WHERE net_units >= 50
ORDER BY net_units DESC
"""
}
}
# Print advanced queries
for key, query in advanced_queries.items():
print(f"\n{'='*70}")
print(f"\n{query['name']}")
print(f"{'='*70}")
print(f"\n{query['description']}")
if 'requires' in query:
print(f"⚠️ Requires: {query['requires']}")
print()
print("```sql")
print(query['sql'].strip())
print("```")
print(f"\n{'='*70}")
# CELL: Save SQL Query Reference
query_reference = """# Berkeley Housing Datasette SQL Queries
## Query 1: Total Housing Units Overview
Summary statistics of all housing projects
```sql
SELECT
COUNT(*) as total_projects,
SUM(net_units) as total_units,
AVG(net_units) as avg_units_per_project,
MIN(net_units) as smallest_project,
MAX(net_units) as largest_project,
MIN(year) as earliest_year,
MAX(year) as latest_year
FROM projects
WHERE net_units > 0
```
## Query 2: Projects by Size Category
Breakdown of projects by unit count ranges
```sql
SELECT
CASE
WHEN net_units >= 200 THEN '🔴 Large (200+ units)'
WHEN net_units >= 100 THEN '🟠 Medium-Large (100-199 units)'
WHEN net_units >= 50 THEN '🟡 Medium (50-99 units)'
WHEN net_units >= 20 THEN '🟢 Small-Medium (20-49 units)'
ELSE '🔵 Small (<20 units)'
END as size_category,
COUNT(*) as num_projects,
SUM(net_units) as total_units,
ROUND(AVG(net_units), 1) as avg_units,
ROUND(SUM(net_units) * 100.0 / (SELECT SUM(net_units) FROM projects), 1) as percent_of_total
FROM projects
WHERE net_units > 0
GROUP BY size_category
ORDER BY MIN(net_units) DESC
```
## Query 3: Projects by Year and Status
Timeline of housing development with status breakdown
```sql
SELECT
year,
status,
COUNT(*) as projects,
SUM(net_units) as units,
ROUND(AVG(net_units), 1) as avg_units_per_project
FROM projects
WHERE year IS NOT NULL
GROUP BY year, status
ORDER BY year DESC, status
```
## Query 4: Top Streets for Development
Which streets have the most housing development
```sql
SELECT
SUBSTR(address_display, INSTR(address_display, ' ') + 1) as street_name,
COUNT(*) as num_projects,
SUM(net_units) as total_units,
ROUND(AVG(net_units), 1) as avg_units
FROM projects
WHERE address_display IS NOT NULL
GROUP BY street_name
HAVING COUNT(*) >= 2
ORDER BY total_units DESC
LIMIT 20
```
## Query 5: Year-over-Year Growth
Trend analysis with year-over-year comparison
```sql
SELECT
year,
COUNT(*) as projects,
SUM(net_units) as units_this_year,
SUM(SUM(net_units)) OVER (ORDER BY year) as cumulative_units,
ROUND((SUM(net_units) - LAG(SUM(net_units)) OVER (ORDER BY year)) * 100.0 /
LAG(SUM(net_units)) OVER (ORDER BY year), 1) as pct_change
FROM projects
WHERE year IS NOT NULL
GROUP BY year
ORDER BY year
```
## Query 6: Projects with Map Links
Large projects with Google Maps links
```sql
SELECT
address_display,
net_units,
year,
status,
'https://www.google.com/maps/search/?api=1&query=' ||
REPLACE(address_display, ' ', '+') || '+Berkeley+CA' as map_link
FROM projects
WHERE net_units >= 50
ORDER BY net_units DESC
```
---
## Using These Queries
1. Visit: https://berkeley-housing.fly.dev/berkeley_housing_map/projects
2. Click "View and edit SQL"
3. Copy and paste any query
4. Click "Run SQL"
5. Export results as CSV/JSON if needed
"""
# Save to file
reference_path = '/Users/johngage/berkeley-data/DATASETTE_QUERIES.md'
with open(reference_path, 'w') as f:
f.write(query_reference)
print(f"✅ Saved query reference to: {reference_path}")
print(f"\nThis file contains all SQL queries ready to use in Datasette!")