SQL

Structured Query Language is the standard language for relational databases. It combines data definition, manipulation, and control.

Data Definition Language (DDL)

CREATE

CREATE TABLE users (
    id          SERIAL PRIMARY KEY,
    username    VARCHAR(50) UNIQUE NOT NULL,
    email       VARCHAR(255) UNIQUE NOT NULL,
    password_hash VARCHAR(255) NOT NULL,
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    is_active   BOOLEAN DEFAULT TRUE
);

CREATE TABLE posts (
    id          SERIAL PRIMARY KEY,
    user_id     INT NOT NULL REFERENCES users(id) ON DELETE CASCADE,
    title       VARCHAR(200) NOT NULL,
    body        TEXT,
    published   BOOLEAN DEFAULT FALSE,
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX idx_posts_user ON posts(user_id);
CREATE INDEX idx_posts_created ON posts(created_at DESC);

ALTER

ALTER TABLE users ADD COLUMN bio TEXT;
ALTER TABLE users ALTER COLUMN username TYPE VARCHAR(100);
ALTER TABLE users DROP COLUMN bio;
ALTER TABLE users ADD CONSTRAINT check_email CHECK (email LIKE '%@%');

DROP and TRUNCATE

DROP TABLE IF EXISTS posts;       -- removes table and data
TRUNCATE TABLE posts;             -- removes all rows, keeps structure (faster than DELETE)

Data Manipulation Language (DML)

SELECT

-- Basic query
SELECT username, email FROM users WHERE is_active = TRUE;

-- With ordering and limit
SELECT * FROM posts ORDER BY created_at DESC LIMIT 10 OFFSET 20;

-- Distinct values
SELECT DISTINCT major FROM students;

INSERT

INSERT INTO users (username, email, password_hash)
VALUES ('alice', 'alice@mail.com', '$2b$12$...');

-- Insert multiple rows
INSERT INTO users (username, email, password_hash) VALUES
    ('bob', 'bob@mail.com', '$2b$12$...'),
    ('carol', 'carol@mail.com', '$2b$12$...');

-- Insert from SELECT (UPSERT with ON CONFLICT in PostgreSQL)
INSERT INTO users (username, email, password_hash)
VALUES ('alice', 'alice@new.com', '$2b$12$...')
ON CONFLICT (username) DO UPDATE SET email = EXCLUDED.email;

UPDATE

UPDATE users SET is_active = FALSE WHERE last_login < '2024-01-01';
UPDATE posts SET published = TRUE WHERE user_id = 1 AND published = FALSE;

DELETE

DELETE FROM posts WHERE published = FALSE AND created_at < '2024-01-01';
DELETE FROM users WHERE id = 42;

Joins

Inner Join

Only rows matching in both tables.

SELECT u.username, p.title
FROM users u
INNER JOIN posts p ON u.id = p.user_id;

Left (Outer) Join

All rows from the left table. NULLs for non-matching right side.

SELECT u.username, COUNT(p.id) AS post_count
FROM users u
LEFT JOIN posts p ON u.id = p.user_id
GROUP BY u.username;
-- Includes users with 0 posts

Right Join

All rows from the right table. NULLs for non-matching left side. Less common (can always rewrite as LEFT JOIN).

Full Outer Join

All rows from both tables. NULLs on either side for non-matches.

Cross Join

Cartesian product — every row from left × every row from right. Rarely useful without a WHERE clause.

SELECT * FROM colors CROSS JOIN sizes;

Self Join

Join a table to itself.

-- Find employees and their managers
SELECT e.name AS employee, m.name AS manager
FROM employees e
LEFT JOIN employees m ON e.manager_id = m.id;

Subqueries

Scalar Subquery

Returns a single value.

SELECT username, (SELECT COUNT(*) FROM posts WHERE user_id = users.id) AS post_count
FROM users;

Row Subquery

Returns a single row.

Table Subquery

Returns a table (used in FROM or with IN/EXISTS).

SELECT * FROM users WHERE id IN (SELECT user_id FROM posts WHERE published = TRUE);

Correlated Subquery

References the outer query. Executed once per outer row.

SELECT * FROM users u
WHERE EXISTS (SELECT 1 FROM posts p WHERE p.user_id = u.id AND p.published = TRUE);

EXISTS vs IN: EXISTS stops at the first match (often faster for correlated subqueries). IN materializes the subquery result.

Aggregation

Aggregate Functions

SELECT
    COUNT(*) AS total,
    COUNT(DISTINCT major) AS unique_majors,
    AVG(gpa) AS avg_gpa,
    MIN(gpa) AS min_gpa,
    MAX(gpa) AS max_gpa,
    SUM(credits) AS total_credits
FROM students;

GROUP BY

SELECT major, COUNT(*) AS count, AVG(gpa) AS avg_gpa
FROM students
GROUP BY major
HAVING AVG(gpa) > 3.0
ORDER BY avg_gpa DESC;

Execution order: FROM → WHERE → GROUP BY → HAVING → SELECT → ORDER BY → LIMIT.

HAVING vs WHERE: WHERE filters rows before grouping. HAVING filters groups after aggregation.

Window Functions

Perform calculations across a window of rows related to the current row — without collapsing rows (unlike GROUP BY).

SELECT
    username,
    score,
    ROW_NUMBER() OVER (ORDER BY score DESC) AS rank,
    RANK() OVER (ORDER BY score DESC) AS rank_with_ties,
    DENSE_RANK() OVER (ORDER BY score DESC) AS dense_rank,
    LAG(score) OVER (ORDER BY created_at) AS prev_score,
    LEAD(score) OVER (ORDER BY created_at) AS next_score,
    SUM(score) OVER (ORDER BY created_at ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total,
    AVG(score) OVER (PARTITION BY department ORDER BY created_at ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg,
    NTILE(4) OVER (ORDER BY score DESC) AS quartile
FROM scores;

PARTITION BY: Divide rows into groups (like GROUP BY but without collapsing).

ORDER BY: Define the ordering within each partition.

Frame: ROWS BETWEEN ... AND ... — defines which rows are included in the calculation.

Common Window Function Patterns

-- Running total
SUM(amount) OVER (ORDER BY date ROWS UNBOUNDED PRECEDING)

-- Moving average (last 7 rows)
AVG(value) OVER (ORDER BY date ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)

-- Percent of total
amount / SUM(amount) OVER () * 100 AS pct_of_total

-- Difference from previous
value - LAG(value) OVER (ORDER BY date) AS change

-- Top N per group
SELECT * FROM (
    SELECT *, ROW_NUMBER() OVER (PARTITION BY department ORDER BY salary DESC) AS rn
    FROM employees
) t WHERE rn <= 3;

Common Table Expressions (CTEs)

Named temporary result sets for readability and reuse.

WITH active_users AS (
    SELECT id, username FROM users WHERE is_active = TRUE
),
user_posts AS (
    SELECT u.username, COUNT(p.id) AS post_count
    FROM active_users u
    JOIN posts p ON u.id = p.user_id
    GROUP BY u.username
)
SELECT * FROM user_posts WHERE post_count > 10 ORDER BY post_count DESC;

Recursive CTEs

-- Organizational hierarchy
WITH RECURSIVE org_tree AS (
    -- Base case: top-level managers (no manager)
    SELECT id, name, manager_id, 0 AS level
    FROM employees WHERE manager_id IS NULL

    UNION ALL

    -- Recursive case: employees under someone in the tree
    SELECT e.id, e.name, e.manager_id, t.level + 1
    FROM employees e
    JOIN org_tree t ON e.manager_id = t.id
)
SELECT * FROM org_tree ORDER BY level, name;

Recursive CTEs enable graph traversal, hierarchical queries, and iterative computations in SQL.

Views

Regular Views

Virtual tables defined by a query. No data stored — query executed on access.

CREATE VIEW active_post_counts AS
SELECT u.username, COUNT(p.id) AS post_count
FROM users u
JOIN posts p ON u.id = p.user_id
WHERE u.is_active = TRUE
GROUP BY u.username;

SELECT * FROM active_post_counts WHERE post_count > 5;

Materialized Views

Store the result physically. Must be refreshed (manually or on schedule).

CREATE MATERIALIZED VIEW monthly_stats AS
SELECT date_trunc('month', created_at) AS month, COUNT(*) AS posts
FROM posts GROUP BY month;

REFRESH MATERIALIZED VIEW monthly_stats;

Use cases: Expensive aggregations, dashboards, data warehouse summaries.

Stored Procedures and Functions

CREATE FUNCTION get_user_posts(uid INT)
RETURNS TABLE(title VARCHAR, created_at TIMESTAMP) AS $$
    SELECT title, created_at FROM posts WHERE user_id = uid ORDER BY created_at DESC;
$$ LANGUAGE SQL;

SELECT * FROM get_user_posts(1);

Triggers

CREATE OR REPLACE FUNCTION update_modified_column()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER set_updated_at
BEFORE UPDATE ON users
FOR EACH ROW EXECUTE FUNCTION update_modified_column();

Applications in CS

Backend development: CRUD operations, complex queries, reporting.
Data analysis: Window functions, CTEs, aggregations for business intelligence.
Data engineering: ETL pipelines, data transformation, materialized views.
Performance tuning: Understanding SQL execution helps write efficient queries and design good schemas.
ORM understanding: Knowing SQL helps debug ORM-generated queries and understand their limitations.