.01
.02
-- 创建一个年龄分段的表
CREATE OR REPLACE TABLE age_split AS
(SELECT * FROM generate_series(25, 90, 5) AS age);
ALTER TABLE age_split
RENAME COLUMN generate_series TO age;
-- 按年龄分割数据并计算Gini不纯度
WITH split AS (
SELECT CASE WHEN age < 40 THEN 'left' ELSE 'right' END as node, buy
FROM insurance
)
SELECT node,
COUNT(*) as total_customers,
SUM(buy) as total_buys,
SUM(1 - buy) as total_not_buys,
(1 - POWER(SUM(buy) / COUNT(*), 2) - POWER(SUM(1 - buy) / COUNT(*), 2)) as gini
FROM split
GROUP BY node;
WITH total_customers AS (
SELECT COUNT(*) AS total FROM insurance
),
gini_calc AS (
SELECT a.age AS split_point,
SUM(CASE WHEN i.age < a.age THEN 1 ELSE 0 END) AS left_count,
SUM(CASE WHEN i.age < a.age AND i.buy = 1 THEN 1 ELSE 0 END) AS left_buy,
SUM(CASE WHEN i.age >= a.age THEN 1 ELSE 0 END) AS right_count,
SUM(CASE WHEN i.age >= a.age AND i.buy = 1 THEN 1 ELSE 0 END) AS right_buy
FROM age_split a
CROSS JOIN insurance i
GROUP BY a.age
),
gini_values AS (
SELECT split_point,
(1 - POWER(left_buy * 1.0 / left_count, 2) - POWER((left_count - left_buy) * 1.0 / left_count, 2)) as left_gini,
(1 - POWER(right_buy * 1.0 / right_count, 2) - POWER((right_count - right_buy) * 1.0 / right_count, 2)) as right_gini,
((left_count * left_gini) + (right_count * right_gini)) / total AS weighted_gini
FROM gini_calc, total_customers total
)
SELECT split_point, weighted_gini
FROM gini_values
ORDER BY weighted_gini
LIMIT 1;
SELECT CASE
WHEN age < 40 THEN
CASE WHEN income < 80000 THEN 0 ELSE 1 END
ELSE
1
END AS predicted_buy,
buy AS actual_buy
FROM insurance;
WITH predictions AS (
SELECT CASE
WHEN age < 40 THEN CASE WHEN income < 80000 THEN 0 ELSE 1 END
ELSE 1
END AS predicted_buy,
buy AS actual_buy
FROM insurance
)
SELECT predicted_buy, actual_buy, COUNT(*) AS count
FROM predictions
GROUP BY predicted_buy, actual_buy
ORDER BY predicted_buy, actual_buy;
优化特征选择:增加如职业、家庭等影响购买决策的特征。 调整超参数:使用不同分裂准则、最小叶节点数等,来提升分类准确性。 应用更复杂的模型:探索其他机器学习算法,如随机森林、提升树等。
.03
参考: