# 数据库设计与实现
数据库使用 MySQL.
# 基础数据表
因为系统没有实现管理员系统, 所以 legal
字段没有发挥实际的作用, 但是应该被正确地设置.
spider
字段在爬虫中起效, 本质上是原来今日头条的数据主键值.
# Customer 用户表
cus_id INT UNSIGNED NOT NULL auto_increment,
cus_name VARCHAR(64) UNIQUE,
cus_pass VARCHAR(255),
-- 爬虫中用于识别用户
cus_spider VARCHAR(64) default '',
-- 用户头像的 url
cus_avatar_url VARCHAR(255) default 'http://localhost:8080/img/Man.png',
-- 用户的个人描述
cus_style VARCHAR(255) default '这个人很懒, 什么都没写',
-- cus_gender 为 0 时性别未知, 为 1 时为男, 为 -1 时为女
cus_gender TINYINT DEFAULT 0,
-- 用户的创建时间
cus_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-- cus_legal 为 0 时待审核, 为 1 时合法, 为 -1 不合法
cus_legal TINYINT default 0,
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
2
3
4
5
6
7
8
9
10
11
12
13
14
15
系统不允许用户创建同名用户. 这张表也就前四个字段比较有用, 剩下的基本都是残废.
# Article 新闻表
art_id INT UNSIGNED NOT NULL auto_increment,
art_title VARCHAR(255) default '',
art_content TEXT,
-- 在爬虫中分辨文章
art_spider VARCHAR(64) default '',
-- 文章的分类
art_type VARCHAR(32),
-- 文章的标签 应该以 & 分隔
art_tags VARCHAR(128) default '',
-- 文章缩略图的信息
art_image_url VARCHAR(255) default '',
art_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
art_legal tinyint default 0,
art_cus_id INT UNSIGNED,
1
2
3
4
5
6
7
8
9
10
11
12
13
14
2
3
4
5
6
7
8
9
10
11
12
13
14
这里 art_tags
计划使用来储存新闻的关键词, 后来没用上.
# Comment 评论表
com_id INT UNSIGNED NOT NULL auto_increment,
com_content TEXT,
com_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
com_legal tinyint default 0,
-- 爬虫过程中的评论标识
com_spider varchar(64) default '',
com_cus_id INT UNSIGNED,
com_art_id INT UNSIGNED,
1
2
3
4
5
6
7
8
2
3
4
5
6
7
8
# Reply 回复表
rep_id INT UNSIGNED NOT NULL auto_increment,
rep_content TEXT,
-- 回复的类型, 0 是对评论的回复, 1 是对回复的回复
rep_type tinyint default 0,
rep_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
rep_legal tinyint default 0,
-- 爬虫过程中的评论标识
rep_spider varchar(64) default '',
rep_cus_id INT UNSIGNED,
rep_art_id INT UNSIGNED,
rep_com_id INT UNSIGNED,
rep_rep_id INT UNSIGNED,
1
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
回复需要考虑对评论的回复与对回复的回复. 当 rep_type
为 0 时, rep_rep_id
将为空.
# 数据统计与行为记录表
# CusBehaviorRecord 用户行为记录表
这是整个数据库中最复杂的表, 记录了所有的用户行为.
cbr_id INT UNSIGNED NOT NULL auto_increment,
-- 用户 攻
cbr_cus_id_from INT UNSIGNED,
-- 用户 受
cbr_cus_id_to INT UNSIGNED,
-- 行为类别
cbr_behavior INT default 0,
-- 行为时间
cbr_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-- 行为发生的文章 ID
cbr_art_id INT UNSIGNED,
-- 行为发生的位置代号, 0: 无发生, 1: 文章, 2: 评论, 3:回复
cbr_type TINYINT DEFAULT 0,
-- target ID
cbr_target_id INT UNSIGNED,
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# ArtFeatureCount 新闻特征统计记录表
这张表统计了每篇新闻下的用户行为数据.
afc_like_num INT UNSIGNED default 0,
afc_dislike_num INT UNSIGNED default 0,
afc_com_num INT UNSIGNED default 0,
afc_rep_num INT UNSIGNED default 0,
afc_read_num INT UNSIGNED default 0,
afc_art_time TIMESTAMP,
1
2
3
4
5
6
2
3
4
5
6
其中, afc_like_num
与 afc_dislike_num
即新闻的点赞与点踩数量没有被使用.
# CusFeatureCount 用户特征统计记录表
这张表记录了每个用户在 18 种不同类别下的用户行为数量.
cfc_news_society INT UNSIGNED default 0,
cfc_news_entertainment INT UNSIGNED default 0,
cfc_news_tech INT UNSIGNED default 0,
cfc_news_military INT UNSIGNED default 0,
cfc_news_sports INT UNSIGNED default 0,
cfc_news_finance INT UNSIGNED default 0,
cfc_news_world INT UNSIGNED default 0,
cfc_news_fashion INT UNSIGNED default 0,
cfc_news_travel INT UNSIGNED default 0,
cfc_news_discovery INT UNSIGNED default 0,
cfc_news_baby INT UNSIGNED default 0,
cfc_news_regimen INT UNSIGNED default 0,
cfc_news_story INT UNSIGNED default 0,
cfc_news_essay INT UNSIGNED default 0,
cfc_news_game INT UNSIGNED default 0,
cfc_news_history INT UNSIGNED default 0,
cfc_news_food INT UNSIGNED default 0,
cfc_news_car INT UNSIGNED default 0,
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# 数据视图
这个我要非常感谢我同学也是我同事 -- "鲁大师" 的建议, 利用视图来计算新闻的排分. 系统只设计了两个数据视图, 实际上需要更多以适配不同的推荐场景.
# ArtScoreList 通用新闻排分表
DROP VIEW IF EXISTS NR.ArtScoreList;
create view NR.ArtScoreList(asl_art_id, asl_art_score) as
select
afc_art_id,
timestampdiff(HOUR,now(),afc_art_time)*20 + cast(afc_like_num as signed)*4 +
cast(afc_dislike_num as signed)*(-6) + cast(afc_com_num as signed)*3+
cast(afc_read_num as signed )*1 + cast(afc_rep_num as signed)*2 as afc_art_socre
from NR.ArtFeatureCount
order by afc_art_socre desc;
1
2
3
4
5
6
7
8
9
2
3
4
5
6
7
8
9
# ArtTimeList 热点新闻排分表
DROP VIEW IF EXISTS NR.ArtTimeList;
create view NR.ArtTimeList(atl_art_id, atl_art_score) as
SELECT
afc_art_id,
timestampdiff(HOUR,now(),afc_art_time)*32 + cast(afc_like_num as signed)*4 +
cast(afc_dislike_num as signed)*(-6) + cast(afc_com_num as signed)*3 +
cast(afc_read_num as signed )*1 + cast(afc_rep_num as signed )*2 as atl_art_score
from NR.ArtFeatureCount left join NR.Article on NR.ArtFeatureCount.afc_art_id = NR.Article.art_id
where
NR.Article.art_type in ("news_society", "news_military", "news_finance", "news_entertainment",
"news_game", "news_sports", "news_world", "news_tech", "news_car", "news_fashion") and
abs(timestampdiff(HOUR,now(),afc_art_time)) < 72
order by atl_art_score desc;
1
2
3
4
5
6
7
8
9
10
11
12
13
2
3
4
5
6
7
8
9
10
11
12
13
← 新闻推荐系统总览 数据爬虫的设计与实现 →