文章目录[隐藏]
一步步教你,为WordPress网站添加智能内容推荐引擎
引言:为什么你的WordPress网站需要智能推荐?
在信息过载的互联网时代,用户注意力已成为最稀缺的资源。据统计,拥有个性化内容推荐的网站,用户停留时间平均增加48%,页面浏览量提升35%,转化率提高20%以上。对于WordPress网站运营者而言,仅仅发布优质内容已远远不够,如何让访客发现更多相关内容,降低跳出率,提高用户粘性,已成为决定网站成败的关键因素。
目前市面上虽然有许多推荐插件,但它们往往存在诸多限制:要么功能过于简单,要么算法不够智能,要么需要支付高昂的订阅费用。更重要的是,这些通用插件无法完全契合每个网站独特的业务逻辑和用户需求。通过代码二次开发,我们不仅能打造完全符合自身需求的智能推荐引擎,还能将其深度集成到网站生态中,实现数据闭环和持续优化。
本文将带你从零开始,通过WordPress程序的代码二次开发,构建一个功能完整、算法可调的智能内容推荐系统。这个系统将包含基于内容的过滤、协同过滤、混合推荐等多种策略,并能通过用户行为数据不断自我优化。
第一部分:准备工作与环境搭建
1.1 理解WordPress推荐系统的基本原理
智能推荐引擎的核心是通过分析用户行为和内容特征,预测用户可能感兴趣的内容。在WordPress环境中,我们需要处理三种基本数据类型:
- 用户数据:注册用户信息、浏览历史、点击行为、停留时间、评分/点赞等
- 内容数据:文章标签、分类、关键词、元数据、作者、发布时间等
- 交互数据:用户与内容的每一次互动记录
推荐算法主要分为以下几类:
- 基于内容的推荐:分析用户过去喜欢的内容特征,推荐具有相似特征的新内容
- 协同过滤:找到与目标用户兴趣相似的其他用户,将他们喜欢的内容推荐给目标用户
- 混合推荐:结合多种推荐策略,取长补短,提高推荐质量
1.2 开发环境配置
在开始开发前,请确保你的环境满足以下条件:
- WordPress版本:5.6或更高版本(确保REST API功能完整)
- PHP版本:7.4或更高(推荐8.0+以获得更好性能)
- MySQL版本:5.7或更高(推荐8.0+)
- 必要的PHP扩展:JSON, cURL, MBString, XML
创建专用开发插件是推荐的做法,这可以确保你的代码与主题和其他插件隔离,便于维护和更新:
/*
Plugin Name: 智能内容推荐引擎
Plugin URI: https://yourwebsite.com/
Description: 为WordPress网站添加智能内容推荐功能
Version: 1.0.0
Author: 你的名字
License: GPL v2 or later
*/
// Stop immediately when this file is requested directly rather than loaded by WordPress
// (WordPress defines ABSPATH before it includes any plugin file).
defined('ABSPATH') || exit;

// Plugin-wide constants: release version, plus the directory path and URL of this plugin
// as reported by plugin_dir_path() / plugin_dir_url() for the current file.
define('SRE_VERSION', '1.0.0');
define('SRE_PLUGIN_DIR', plugin_dir_path(__FILE__));
define('SRE_PLUGIN_URL', plugin_dir_url(__FILE__));
1.3 数据库表设计
我们需要创建专门的数据库表来存储用户行为数据和推荐模型:
register_activation_hook(__FILE__, 'sre_create_tables');

/**
 * Creates or upgrades the plugin's custom tables when the plugin is activated.
 *
 * The statements follow dbDelta()'s formatting rules: plain `CREATE TABLE`
 * (dbDelta takes the token right after `CREATE TABLE` as the table name, so
 * `IF NOT EXISTS` would prevent it from ever recognising an existing table),
 * one column per line, and two spaces after `PRIMARY KEY`.
 *
 * Tables:
 *  - {prefix}sre_user_actions     raw interaction log; includes the `metadata`
 *                                 column that SRE_User_Tracker::record_action() inserts.
 *  - {prefix}sre_recommendations  cached recommendation results with an expiry time.
 *  - {prefix}sre_tfidf_model      word / document-frequency pairs read and written
 *                                 by SRE_TFIDF_Processor.
 *  - {prefix}sre_document_vectors one serialized TF-IDF vector per post; post_id is
 *                                 the primary key so that the processor's
 *                                 `ON DUPLICATE KEY UPDATE` / `$wpdb->replace()` upsert.
 *
 * @return void
 */
function sre_create_tables() {
    global $wpdb;

    require_once ABSPATH . 'wp-admin/includes/upgrade.php';

    $charset_collate = $wpdb->get_charset_collate();
    $table_prefix    = $wpdb->prefix . 'sre_';

    // User action log.
    $user_actions_table = $table_prefix . 'user_actions';
    $sql_user_actions = "CREATE TABLE $user_actions_table (
        id bigint(20) NOT NULL AUTO_INCREMENT,
        user_id bigint(20) DEFAULT NULL,
        session_id varchar(100) NOT NULL,
        post_id bigint(20) NOT NULL,
        action_type varchar(50) NOT NULL COMMENT 'view, click, like, share, etc.',
        action_value decimal(5,2) DEFAULT 1.0,
        metadata longtext,
        created_at datetime DEFAULT CURRENT_TIMESTAMP,
        PRIMARY KEY  (id),
        KEY user_id (user_id),
        KEY post_id (post_id),
        KEY action_type (action_type),
        KEY created_at (created_at)
    ) $charset_collate;";

    // Recommendation cache.
    $recommendations_table = $table_prefix . 'recommendations';
    $sql_recommendations = "CREATE TABLE $recommendations_table (
        id bigint(20) NOT NULL AUTO_INCREMENT,
        user_id bigint(20) DEFAULT NULL,
        session_id varchar(100) DEFAULT NULL,
        post_id bigint(20) NOT NULL,
        recommendation_type varchar(50) NOT NULL,
        score decimal(5,4) NOT NULL,
        algorithm_params text,
        created_at datetime DEFAULT CURRENT_TIMESTAMP,
        expires_at datetime NOT NULL,
        PRIMARY KEY  (id),
        UNIQUE KEY unique_recommendation (user_id, session_id, post_id, recommendation_type(20)),
        KEY score (score),
        KEY expires_at (expires_at)
    ) $charset_collate;";

    // TF-IDF vocabulary. `word` is deliberately NOT unique: the processor uses plain
    // INSERTs, and a case-insensitive collation could otherwise reject a whole batch.
    $tfidf_model_table = $table_prefix . 'tfidf_model';
    $sql_tfidf_model = "CREATE TABLE $tfidf_model_table (
        id bigint(20) NOT NULL AUTO_INCREMENT,
        word varchar(255) NOT NULL,
        doc_frequency int(11) NOT NULL DEFAULT 0,
        PRIMARY KEY  (id),
        KEY word (word(191)),
        KEY doc_frequency (doc_frequency)
    ) $charset_collate;";

    // Per-post TF-IDF vectors.
    $document_vectors_table = $table_prefix . 'document_vectors';
    $sql_document_vectors = "CREATE TABLE $document_vectors_table (
        post_id bigint(20) NOT NULL,
        vector_data longtext NOT NULL,
        updated_at datetime NOT NULL,
        PRIMARY KEY  (post_id)
    ) $charset_collate;";

    dbDelta(array(
        $sql_user_actions,
        $sql_recommendations,
        $sql_tfidf_model,
        $sql_document_vectors,
    ));
}
第二部分:用户行为追踪系统
2.1 设计用户行为数据模型
用户行为是推荐系统的燃料。我们需要设计一个灵活的系统来捕获各种用户交互:
/**
 * Singleton that records visitor interactions in the `{prefix}sre_user_actions` table.
 *
 * Each visitor is identified by the `sre_session_id` cookie (a UUID valid for 30 days),
 * and additionally by the WordPress user ID when logged in.
 */
class SRE_User_Tracker {
    /** @var SRE_User_Tracker|null Shared instance. */
    private static $instance = null;

    /** @var string Session identifier stored with every recorded action. */
    private $session_id;

    /**
     * Returns the shared tracker, creating it on first use.
     *
     * @return SRE_User_Tracker
     */
    public static function get_instance() {
        if (null === self::$instance) {
            self::$instance = new self();
        }
        return self::$instance;
    }

    private function __construct() {
        $this->session_id = $this->generate_session_id();
        $this->init_hooks();
    }

    /**
     * Reads the session ID from the cookie, or creates a new one and sends the cookie.
     *
     * @return string Session identifier, at most 100 characters (the column width).
     */
    private function generate_session_id() {
        if (!empty($_COOKIE['sre_session_id'])) {
            $from_cookie = sanitize_text_field(wp_unslash($_COOKIE['sre_session_id']));
            if ('' !== $from_cookie) {
                // The cookie is client-controlled; keep it within session_id varchar(100).
                return mb_substr($from_cookie, 0, 100);
            }
        }

        $session_id = wp_generate_uuid4();

        // setcookie() only works before output starts; when it is too late the ID is
        // still used for this request and a cookie is attempted again on the next one.
        // The cookie is intentionally not HttpOnly because the front-end tracker reads it.
        if (!headers_sent()) {
            setcookie('sre_session_id', $session_id, time() + 30 * DAY_IN_SECONDS, '/', '', is_ssl(), false);
        }

        return $session_id;
    }

    /**
     * Registers the WordPress hooks used for tracking.
     *
     * NOTE(review): track_click_ajax() and add_scroll_tracking_script() are not defined
     * in the visible part of this class — confirm they exist before these hooks fire.
     * The front-end script posts to the `sre_track_batch` action, which is not
     * registered here either — verify the matching AJAX handler.
     */
    private function init_hooks() {
        // Page views.
        add_action('wp', array($this, 'track_page_view'));
        // Click events (via AJAX), for logged-in and anonymous visitors.
        add_action('wp_ajax_sre_track_click', array($this, 'track_click_ajax'));
        add_action('wp_ajax_nopriv_sre_track_click', array($this, 'track_click_ajax'));
        // Scroll-depth tracking script.
        add_action('wp_footer', array($this, 'add_scroll_tracking_script'));
    }

    /**
     * Records a `view` action when a single post or a page is being displayed.
     *
     * @return void
     */
    public function track_page_view() {
        if (!is_single() && !is_page()) {
            return;
        }

        // Ask the main query for the ID instead of dereferencing a possibly unset global $post.
        $post_id = get_queried_object_id();
        if (!$post_id) {
            return;
        }

        $user_agent = isset($_SERVER['HTTP_USER_AGENT'])
            ? sanitize_text_field(wp_unslash($_SERVER['HTTP_USER_AGENT']))
            : '';

        $this->record_action(array(
            'user_id' => is_user_logged_in() ? get_current_user_id() : null,
            'post_id' => $post_id,
            'action_type' => 'view',
            'action_value' => 1.0,
            'metadata' => array(
                'referrer' => wp_get_referer(),
                'user_agent' => $user_agent,
                'scroll_depth' => 0 // Initial value; meant to be updated from JavaScript.
            )
        ));
    }

    /**
     * Inserts one action row and fires `sre_user_action_recorded` on success.
     *
     * @param array $data Any of: user_id, session_id, post_id, action_type, action_value,
     *                    metadata (serialized before storage), created_at. Missing keys
     *                    fall back to defaults; unknown keys are not written to the table
     *                    but are still passed to the action hook.
     * @return int Inserted row ID, or 0 when the insert failed.
     */
    public function record_action($data) {
        global $wpdb;
        $table_name = $wpdb->prefix . 'sre_user_actions';

        $defaults = array(
            'user_id' => null,
            'session_id' => $this->session_id,
            'post_id' => 0,
            'action_type' => 'unknown',
            'action_value' => 1.0,
            'metadata' => array(),
            'created_at' => current_time('mysql')
        );
        $data = wp_parse_args($data, $defaults);
        $data['metadata'] = maybe_serialize($data['metadata']);

        // wp_parse_args() puts the default keys first and in their declared order, so
        // this keeps exactly the table columns, aligned with the format list below.
        $row = array_intersect_key($data, $defaults);

        $inserted = $wpdb->insert(
            $table_name,
            $row,
            array('%d', '%s', '%d', '%s', '%f', '%s', '%s')
        );
        if (false === $inserted) {
            // Do not announce an action that was never stored, and do not hand back an
            // insert ID that may belong to an earlier query.
            return 0;
        }

        // Let other components react to the stored action.
        do_action('sre_user_action_recorded', $data);

        return (int) $wpdb->insert_id;
    }
    // More tracking methods...
}
2.2 实现实时行为追踪前端脚本
用户行为追踪需要前后端配合,以下是一个完整的前端追踪脚本:
// sre-frontend-tracker.js
(function() {
    'use strict';

    // How often the buffered events are sent.
    const FLUSH_INTERVAL_MS = 5000;
    // Queue length above which a flush is triggered immediately.
    const FLUSH_THRESHOLD = 20;
    // Hard cap on buffered events so a permanently failing endpoint cannot grow memory without bound.
    const MAX_QUEUE_LENGTH = 200;
    // Length of one activity-sampling window for reading-time measurement.
    const ACTIVITY_WINDOW_MS = 10000;
    // Reading time is reported once this much active time has accumulated.
    const READING_REPORT_MS = 60000;

    /**
     * Collects click, scroll-depth and reading-time events in the browser and
     * posts them in batches to the AJAX endpoint given in window.sreTrackerConfig
     * (ajaxUrl, nonce, userId, postId).
     */
    class SREFrontendTracker {
        constructor() {
            this.config = window.sreTrackerConfig || {};
            this.sessionId = this.getSessionId();
            this.userId = this.config.userId || 0;
            this.currentPostId = this.config.postId || 0;
            this.trackingQueue = [];
            this.isTracking = false;
            this.init();
        }

        /** Returns the session ID from the cookie, creating and storing one if absent. */
        getSessionId() {
            let sessionId = this.getCookie('sre_session_id');
            if (!sessionId) {
                sessionId = this.generateUUID();
                this.setCookie('sre_session_id', sessionId, 30);
            }
            return sessionId;
        }

        init() {
            this.trackClicks();
            this.trackScrollDepth();
            this.trackReadingTime();

            // Send buffered events periodically.
            setInterval(() => this.flushQueue(), FLUSH_INTERVAL_MS);

            // Send what is left when the page goes away. Both events are handled because
            // flushQueueSync() empties the queue once the beacon is accepted, so a second
            // call cannot send duplicates.
            window.addEventListener('beforeunload', () => this.flushQueueSync());
            window.addEventListener('pagehide', () => this.flushQueueSync());
        }

        trackClicks() {
            document.addEventListener('click', (e) => {
                // Find the nearest enclosing link, if any.
                const origin = e.target;
                const link = origin && typeof origin.closest === 'function' ? origin.closest('a') : null;
                if (link) {
                    this.handleLinkClick(link, e);
                }
            }, true);
        }

        handleLinkClick(link, event) {
            const href = link.getAttribute('href');

            // Ignore in-page anchors and javascript: pseudo links. Internal AND external
            // links are tracked; `is_internal` tells them apart.
            if (!href || href.startsWith('#') || href.startsWith('javascript:')) {
                return;
            }

            const isInternal = this.isInternalLink(href);
            const trackingData = {
                action_type: 'click',
                target_url: href,
                link_text: link.textContent.trim().substring(0, 100),
                link_classes: link.className,
                is_internal: isInternal,
                position_x: event.clientX,
                position_y: event.clientY
            };

            // Internal links inside a recommendation widget are recorded as recommendation clicks.
            const recommendation = isInternal ? link.closest('.sre-recommendation') : null;
            if (recommendation) {
                trackingData.action_type = 'recommendation_click';
                trackingData.recommendation_type = recommendation.dataset.recommendationType;
                trackingData.recommendation_score = recommendation.dataset.recommendationScore;
            }

            // Queue under the resolved type so the event's outer action_type agrees with its payload.
            this.queueTracking(trackingData.action_type, trackingData);
        }

        trackScrollDepth() {
            let maxScrollDepth = 0;
            const scrollCheckpoints = [25, 50, 75, 90, 100];
            const reportedCheckpoints = new Set();

            window.addEventListener('scroll', () => {
                const scrollTop = window.pageYOffset || document.documentElement.scrollTop;
                const scrollHeight = document.documentElement.scrollHeight - document.documentElement.clientHeight;
                const currentDepth = scrollHeight > 0 ? Math.round((scrollTop / scrollHeight) * 100) : 0;

                if (currentDepth <= maxScrollDepth) {
                    return;
                }
                maxScrollDepth = currentDepth;

                // Report each checkpoint once, the first time it is reached.
                scrollCheckpoints.forEach(checkpoint => {
                    if (currentDepth >= checkpoint && !reportedCheckpoints.has(checkpoint)) {
                        this.queueTracking('scroll', {
                            scroll_depth: checkpoint,
                            max_depth: maxScrollDepth
                        });
                        reportedCheckpoints.add(checkpoint);
                    }
                });
            }, { passive: true });
        }

        trackReadingTime() {
            let activeTime = 0;
            let lastActive = Date.now();
            let isActive = true;

            // Any of these events marks the current window as active.
            const activityEvents = ['mousemove', 'keydown', 'click', 'scroll'];
            const resetActive = () => {
                if (!isActive) {
                    isActive = true;
                    lastActive = Date.now();
                }
            };
            activityEvents.forEach(event => {
                document.addEventListener(event, resetActive, { passive: true });
            });

            // Sample the activity state once per window.
            setInterval(() => {
                const now = Date.now();
                // Timers never fire early, so a full window measures at least
                // ACTIVITY_WINDOW_MS; a strict "< window" comparison would always discard
                // it (notably the first window after page load). The 1.5x slack still
                // rejects long gaps caused by throttled background-tab timers.
                if (isActive && now - lastActive <= ACTIVITY_WINDOW_MS * 1.5) {
                    activeTime += now - lastActive;
                }
                lastActive = now;
                isActive = false;

                // Report whole minutes of accumulated active time; carry the remainder over.
                if (activeTime >= READING_REPORT_MS) {
                    this.queueTracking('reading_time', {
                        minutes: Math.floor(activeTime / READING_REPORT_MS),
                        total_active_ms: activeTime
                    });
                    activeTime = activeTime % READING_REPORT_MS;
                }
            }, ACTIVITY_WINDOW_MS);
        }

        queueTracking(actionType, data) {
            this.trackingQueue.push({
                timestamp: Date.now(),
                action_type: actionType,
                data: data
            });
            this.trimQueue();

            // Send right away when the queue is getting long.
            if (this.trackingQueue.length > FLUSH_THRESHOLD) {
                this.flushQueue();
            }
        }

        /** Drops the oldest events once the queue exceeds MAX_QUEUE_LENGTH. */
        trimQueue() {
            const overflow = this.trackingQueue.length - MAX_QUEUE_LENGTH;
            if (overflow > 0) {
                this.trackingQueue.splice(0, overflow);
            }
        }

        /** Builds the form-encoded request body shared by both flush paths. */
        buildPayload(events, extra) {
            return new URLSearchParams(Object.assign({
                action: 'sre_track_batch',
                nonce: this.config.nonce || '',
                session_id: this.sessionId,
                user_id: this.userId,
                post_id: this.currentPostId,
                events: JSON.stringify(events)
            }, extra || {}));
        }

        async flushQueue() {
            if (this.isTracking || this.trackingQueue.length === 0) {
                return;
            }
            if (!this.config.ajaxUrl) {
                // Without an endpoint the events can never be delivered; discard them
                // instead of posting to the literal URL "undefined".
                this.trackingQueue = [];
                return;
            }

            this.isTracking = true;
            const queueToSend = this.trackingQueue;
            this.trackingQueue = [];

            // The batch was removed from the queue above, so putting it back cannot
            // create duplicates; failed events go in front to preserve order.
            const requeue = () => {
                this.trackingQueue = [...queueToSend, ...this.trackingQueue];
                this.trimQueue();
            };

            try {
                const response = await fetch(this.config.ajaxUrl, {
                    method: 'POST',
                    headers: {
                        'Content-Type': 'application/x-www-form-urlencoded',
                    },
                    body: this.buildPayload(queueToSend)
                });
                if (!response.ok) {
                    requeue();
                }
            } catch (error) {
                console.error('SRE Tracking Error:', error);
                // Network error: keep the events for the next attempt.
                requeue();
            } finally {
                this.isTracking = false;
            }
        }

        // Flush used while the page is unloading.
        flushQueueSync() {
            if (this.trackingQueue.length === 0 || !this.config.ajaxUrl) return;
            if (typeof navigator.sendBeacon !== 'function') return;

            const data = this.buildPayload(this.trackingQueue, { sync: '1' });

            // sendBeacon queues the request without blocking unload. Clear the queue only
            // when the browser accepted it, so events are neither lost nor re-sent if the
            // page stays open (e.g. a cancelled navigation).
            if (navigator.sendBeacon(this.config.ajaxUrl, data)) {
                this.trackingQueue = [];
            }
        }

        // Utility methods

        isInternalLink(href) {
            try {
                const url = new URL(href, window.location.origin);
                return url.hostname === window.location.hostname;
            } catch {
                return false;
            }
        }

        generateUUID() {
            // Prefer the platform generator where available (secure contexts).
            if (window.crypto && typeof window.crypto.randomUUID === 'function') {
                return window.crypto.randomUUID();
            }
            return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
                const r = Math.random() * 16 | 0;
                const v = c === 'x' ? r : (r & 0x3 | 0x8);
                return v.toString(16);
            });
        }

        getCookie(name) {
            const match = document.cookie.match(new RegExp('(^| )' + name + '=([^;]+)'));
            if (!match) {
                return null;
            }
            try {
                return decodeURIComponent(match[2]);
            } catch {
                // Malformed percent-encoding: fall back to the raw value rather than
                // letting a URIError abort tracker start-up.
                return match[2];
            }
        }

        setCookie(name, value, days) {
            const date = new Date();
            date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));
            document.cookie = `${name}=${encodeURIComponent(value)}; expires=${date.toUTCString()}; path=/; SameSite=Lax`;
        }
    }

    // Start the tracker once the DOM is ready — immediately when the script was loaded
    // after DOMContentLoaded already fired (async / late-injected scripts).
    const startTracker = () => {
        window.sreTracker = new SREFrontendTracker();
    };
    if (document.readyState === 'loading') {
        document.addEventListener('DOMContentLoaded', startTracker);
    } else {
        startTracker();
    }
})();
第三部分:内容特征提取与向量化
3.1 构建内容特征模型
为了进行智能推荐,我们需要将文章内容转化为机器可理解的数值特征:
class SRE_Content_Analyzer {
private $stop_words;
public function __construct() {
$this->load_stop_words();
}
private function load_stop_words() {
// 中文停用词列表(简化版)
$chinese_stop_words = array(
'的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你',
第三部分:内容特征提取与向量化(续)
3.2 实现TF-IDF特征提取
TF-IDF(词频-逆文档频率)是内容推荐中常用的特征提取方法,它能识别出对文档最具代表性的词语:
/**
 * Builds, persists and queries a TF-IDF model over published posts.
 *
 * The vocabulary (word => document frequency) lives in `{prefix}sre_tfidf_model`,
 * the corpus size in the `sre_total_docs` option, and one L2-normalised vector per
 * post in `{prefix}sre_document_vectors`.
 *
 * NOTE(review): this class reads and writes both tables but does not create them;
 * confirm they are created on plugin activation.
 */
class SRE_TFIDF_Processor {
    private $word_doc_freq = array(); // word => number of documents containing it
    private $total_docs = 0;          // number of documents in the trained corpus
    private $vocabulary = array();    // words used as vector dimensions
    private $min_word_length = 2;     // tokens shorter than this are dropped
    private $max_features = 1000;     // maximum vocabulary size

    public function __construct() {
        $this->load_existing_model();
    }

    /**
     * Loads a previously saved model from the database, if the model table exists.
     */
    private function load_existing_model() {
        global $wpdb;
        $table_name = $wpdb->prefix . 'sre_tfidf_model';

        // "_" is a LIKE wildcard, so the table name has to be escaped to match literally.
        $table_exists = $wpdb->get_var(
            $wpdb->prepare('SHOW TABLES LIKE %s', $wpdb->esc_like($table_name))
        );
        if (!$table_exists) {
            return;
        }

        $model_data = $wpdb->get_results(
            $wpdb->prepare(
                "SELECT word, doc_frequency FROM $table_name ORDER BY doc_frequency DESC LIMIT %d",
                $this->max_features
            )
        );
        // get_results() yields null on failure; treat that as an empty model.
        foreach ((array) $model_data as $row) {
            $this->word_doc_freq[$row->word] = (int) $row->doc_frequency;
            $this->vocabulary[] = $row->word;
        }

        // Stored by save_model() through update_option().
        $this->total_docs = (int) get_option('sre_total_docs', 0);
    }

    /**
     * Trains the model on up to 5000 of the most recent published posts.
     *
     * @param bool $force_retrain Rebuild even when a model is already loaded.
     * @return bool Always true.
     */
    public function train_model($force_retrain = false) {
        if (!$force_retrain && !empty($this->vocabulary) && $this->total_docs > 0) {
            return true; // A model is already available.
        }

        global $wpdb;
        $posts = $wpdb->get_results(
            "SELECT ID, post_title, post_content
            FROM {$wpdb->posts}
            WHERE post_status = 'publish'
            AND post_type = 'post'
            ORDER BY ID DESC
            LIMIT 5000" // Bounded for performance.
        );
        if (!is_array($posts)) {
            $posts = array(); // Query failed; count() must not see null.
        }

        // Start from a clean slate: without this a forced retrain would add the new
        // counts on top of the document frequencies loaded from the previous model.
        $this->word_doc_freq = array();
        $this->vocabulary = array();
        $this->total_docs = count($posts);

        $doc_word_freq = array(); // post ID => (word => occurrences in that post)
        foreach ($posts as $post) {
            $words = $this->extract_words($post);
            foreach (array_unique($words) as $word) {
                $this->word_doc_freq[$word] = ($this->word_doc_freq[$word] ?? 0) + 1;
            }
            $doc_word_freq[$post->ID] = array_count_values($words);
        }

        // Keep only the max_features words with the highest document frequency. The
        // in-memory model is pruned as well, so it equals what load_existing_model()
        // would read back and only the words actually used are persisted.
        arsort($this->word_doc_freq);
        $this->word_doc_freq = array_slice($this->word_doc_freq, 0, $this->max_features, true);
        $this->vocabulary = array_keys($this->word_doc_freq);

        $this->save_model();
        $this->calculate_document_vectors($posts, $doc_word_freq);

        return true;
    }

    /**
     * Tokenises a post's title and content.
     *
     * Simplified tokenisation: every uninterrupted run of CJK ideographs counts as one
     * "word" (no real word segmentation), plus ASCII words of two or more letters.
     *
     * @param object $post Object exposing post_title and post_content.
     * @return array List of tokens, stop words and too-short tokens removed.
     */
    private function extract_words($post) {
        $text = wp_strip_all_tags($post->post_title . ' ' . $post->post_content);

        // The original code uses the presence of mb_split() as a proxy for mbstring support.
        if (function_exists('mb_split')) {
            // Runs of CJK Unified Ideographs (U+4E00..U+9FA5).
            preg_match_all('/[\x{4e00}-\x{9fa5}]+/u', $text, $matches);
            $chinese_words = $matches[0] ?? array();

            // ASCII words of at least two letters.
            preg_match_all('/\b[a-zA-Z]{2,}\b/', $text, $english_matches);
            $english_words = $english_matches[0] ?? array();

            $words = array_merge($chinese_words, $english_words);
        } else {
            // Plain whitespace split (adequate for English text only).
            $words = preg_split('/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
            if (!is_array($words)) {
                $words = array();
            }
        }

        $words = array_filter($words, function($word) {
            return mb_strlen($word, 'UTF-8') >= $this->min_word_length
                && !$this->is_stop_word($word);
        });

        return array_values($words);
    }

    /**
     * Tells whether a token is a stop word (case-insensitive).
     *
     * @param string $word
     * @return bool
     */
    private function is_stop_word($word) {
        static $stop_words = array(
            '的', '了', '在', '是', '我', '有', '和', '就', '不', '人',
            '都', '一', '个', '上', '也', '很', '到', '说', '要', '去',
            '你', '会', '着', '没有', '看', '好', '自己', '这', '那',
            'the', 'and', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of'
        );
        return in_array(mb_strtolower((string) $word, 'UTF-8'), $stop_words, true);
    }

    /**
     * TF-IDF weight with log-scaled term frequency and smoothed IDF.
     *
     * @param int $term_freq Occurrences of the word in the document.
     * @param int $doc_freq  Number of documents containing the word.
     * @return float|int 0 when either frequency is not positive.
     */
    private function calculate_tfidf($term_freq, $doc_freq) {
        if ($doc_freq <= 0 || $term_freq <= 0) {
            return 0; // log(0) is undefined; an absent word carries no weight.
        }
        $tf = 1 + log($term_freq);
        $idf = log(($this->total_docs + 1) / ($doc_freq + 1)) + 1; // smoothed
        return $tf * $idf;
    }

    /**
     * Builds the normalised TF-IDF vector for one document.
     *
     * @param array $word_freq word => occurrences in the document.
     * @return array word => weight, vocabulary words present in the document only.
     */
    private function build_vector($word_freq) {
        $vector = array();
        foreach ($this->vocabulary as $word) {
            $term_freq = $word_freq[$word] ?? 0;
            $doc_freq = $this->word_doc_freq[$word] ?? 0;
            if ($term_freq > 0 && $doc_freq > 0) {
                $vector[$word] = round($this->calculate_tfidf($term_freq, $doc_freq), 6);
            }
        }
        return $this->normalize_vector($vector);
    }

    /**
     * Computes and stores the vector of every given post, in batches of 100 rows.
     *
     * @param array $posts         Post rows exposing ID.
     * @param array $doc_word_freq post ID => (word => occurrences).
     */
    private function calculate_document_vectors($posts, $doc_word_freq) {
        $batch_size = 100;
        $batch_data = array();

        foreach ($posts as $post) {
            $vector = $this->build_vector($doc_word_freq[$post->ID] ?? array());
            $batch_data[] = array(
                'post_id' => $post->ID,
                'vector_data' => maybe_serialize($vector),
                'updated_at' => current_time('mysql')
            );
            if (count($batch_data) >= $batch_size) {
                $this->batch_insert_vectors($batch_data);
                $batch_data = array();
            }
        }

        // Remaining rows.
        if (!empty($batch_data)) {
            $this->batch_insert_vectors($batch_data);
        }
    }

    /**
     * Inserts or updates a batch of vector rows with a single statement.
     *
     * @param array $batch_data Rows with post_id, vector_data, updated_at.
     */
    private function batch_insert_vectors($batch_data) {
        if (empty($batch_data)) {
            return;
        }
        global $wpdb;
        $table_name = $wpdb->prefix . 'sre_document_vectors';

        $placeholders = array();
        $data = array();
        foreach ($batch_data as $row) {
            $placeholders[] = "(%d, %s, %s)";
            $data[] = $row['post_id'];
            $data[] = $row['vector_data'];
            $data[] = $row['updated_at'];
        }

        $query = "INSERT INTO $table_name (post_id, vector_data, updated_at)
            VALUES " . implode(', ', $placeholders) . "
            ON DUPLICATE KEY UPDATE
            vector_data = VALUES(vector_data),
            updated_at = VALUES(updated_at)";
        $wpdb->query($wpdb->prepare($query, $data));
    }

    /**
     * Scales a vector to unit Euclidean length; an all-zero or empty vector is returned unchanged.
     *
     * @param array $vector
     * @return array
     */
    private function normalize_vector($vector) {
        $norm = 0;
        foreach ($vector as $value) {
            $norm += $value * $value;
        }
        if ($norm > 0) {
            $norm = sqrt($norm);
            foreach ($vector as $key => $value) {
                $vector[$key] = $value / $norm;
            }
        }
        return $vector;
    }

    /**
     * Returns the stored TF-IDF vector of a post, computing and caching it when missing.
     *
     * @param int $post_id
     * @return array word => weight; empty when the post does not exist.
     */
    public function get_document_vector($post_id) {
        global $wpdb;
        $table_name = $wpdb->prefix . 'sre_document_vectors';

        $vector_data = $wpdb->get_var(
            $wpdb->prepare(
                "SELECT vector_data FROM $table_name WHERE post_id = %d",
                $post_id
            )
        );
        if ($vector_data) {
            $stored = maybe_unserialize($vector_data);
            // Corrupt data must not leak out as a non-array.
            return is_array($stored) ? $stored : array();
        }

        // No stored vector: compute it on the fly.
        $post = get_post($post_id);
        if (!$post) {
            return array();
        }
        $vector = $this->build_vector(array_count_values($this->extract_words($post)));

        // Cache only when a model exists; an empty vector produced by an untrained
        // model would otherwise be served from the table from then on.
        if (!empty($this->vocabulary)) {
            $wpdb->replace(
                $table_name,
                array(
                    'post_id' => $post_id,
                    'vector_data' => maybe_serialize($vector),
                    'updated_at' => current_time('mysql')
                ),
                array('%d', '%s', '%s')
            );
        }

        return $vector;
    }

    /**
     * Cosine similarity of two sparse vectors (key => weight).
     *
     * @param array $vector1
     * @param array $vector2
     * @return float|int 0 when either vector is empty or has zero length.
     */
    public function cosine_similarity($vector1, $vector2) {
        if (empty($vector1) || empty($vector2)) {
            return 0;
        }

        // Only keys present in both vectors contribute to the dot product.
        $dot_product = 0;
        foreach (array_intersect_key($vector1, $vector2) as $key => $value) {
            $dot_product += $value * $vector2[$key];
        }

        $norm1 = 0;
        foreach ($vector1 as $value) {
            $norm1 += $value * $value;
        }
        $norm2 = 0;
        foreach ($vector2 as $value) {
            $norm2 += $value * $value;
        }
        if ($norm1 == 0 || $norm2 == 0) {
            return 0;
        }

        return $dot_product / (sqrt($norm1) * sqrt($norm2));
    }

    /**
     * Replaces the persisted model with the current vocabulary and corpus size.
     */
    private function save_model() {
        global $wpdb;
        $table_name = $wpdb->prefix . 'sre_tfidf_model';

        // Remove the previous model.
        $wpdb->query("TRUNCATE TABLE $table_name");

        // Insert in batches of 500 rows.
        $batch_size = 500;
        $batch_data = array();
        foreach ($this->word_doc_freq as $word => $freq) {
            $batch_data[] = array(
                'word' => $word,
                'doc_frequency' => $freq
            );
            if (count($batch_data) >= $batch_size) {
                $this->batch_insert_model($batch_data);
                $batch_data = array();
            }
        }
        if (!empty($batch_data)) {
            $this->batch_insert_model($batch_data);
        }

        // Corpus size, not autoloaded.
        update_option('sre_total_docs', $this->total_docs, false);
    }

    /**
     * Inserts a batch of (word, doc_frequency) rows with a single statement.
     *
     * @param array $batch_data Rows with word and doc_frequency.
     */
    private function batch_insert_model($batch_data) {
        if (empty($batch_data)) {
            return;
        }
        global $wpdb;
        $table_name = $wpdb->prefix . 'sre_tfidf_model';

        $placeholders = array();
        $data = array();
        foreach ($batch_data as $row) {
            $placeholders[] = "(%s, %d)";
            $data[] = $row['word'];
            $data[] = $row['doc_frequency'];
        }

        $query = "INSERT INTO $table_name (word, doc_frequency)
            VALUES " . implode(', ', $placeholders);
        $wpdb->query($wpdb->prepare($query, $data));
    }
}
3.3 构建内容元数据特征
除了文本内容,文章的元数据也是重要的推荐特征:
class SRE_Metadata_Extractor {
/**
 * Extracts binary metadata features for a post.
 *
 * The result maps feature names to 1/0 flags: categories (`cat_*`), tags (`tag_*`),
 * author (`author_*`), publication year / month / weekday, content length bucket,
 * featured image, comment count bucket and estimated reading time bucket. Category,
 * tag, author and date features are only present when they apply (value 1); the
 * bucket features are always present, with exactly one bucket per group set to 1.
 *
 * @param int $post_id
 * @return array Feature name => 1|0; empty when the post does not exist.
 */
public function extract_metadata_features($post_id) {
    $post = get_post($post_id);
    if (!$post) {
        return array();
    }

    $features = array();

    // 1. Categories. A failed term query (WP_Error) contributes no features.
    $categories = wp_get_post_categories($post_id, array('fields' => 'names'));
    if (is_array($categories)) {
        foreach ($categories as $category) {
            $features['cat_' . sanitize_title($category)] = 1;
        }
    }

    // 2. Tags.
    $tags = wp_get_post_tags($post_id, array('fields' => 'names'));
    if (is_array($tags)) {
        foreach ($tags as $tag) {
            $features['tag_' . sanitize_title($tag)] = 1;
        }
    }

    // 3. Author.
    $author = get_the_author_meta('display_name', $post->post_author);
    $features['author_' . sanitize_title($author)] = 1;

    // 4. Publication date.
    $post_date = strtotime($post->post_date);
    $features['year_' . date('Y', $post_date)] = 1;
    $features['month_' . date('m', $post_date)] = 1;
    $features['weekday_' . date('w', $post_date)] = 1;

    // 5. Content length (characters of plain text).
    $plain_text = wp_strip_all_tags($post->post_content);
    $content_length = mb_strlen($plain_text, 'UTF-8');
    $features['length_short'] = $content_length < 1000 ? 1 : 0;
    $features['length_medium'] = ($content_length >= 1000 && $content_length < 3000) ? 1 : 0;
    $features['length_long'] = $content_length >= 3000 ? 1 : 0;

    // 6. Featured image.
    $features['has_thumbnail'] = has_post_thumbnail($post_id) ? 1 : 0;

    // 7. Comment count.
    $comment_count = (int) get_comments_number($post_id);
    $features['comments_none'] = $comment_count === 0 ? 1 : 0;
    $features['comments_few'] = ($comment_count > 0 && $comment_count <= 10) ? 1 : 0;
    $features['comments_many'] = $comment_count > 10 ? 1 : 0;

    // 8. Estimated reading time at 200 words/characters per minute.
    // str_word_count() only recognises Latin-letter words, so on its own it reports
    // (almost) zero for Chinese text. Count each Han character as one word and add
    // the Latin words found in the remaining text.
    $han_chars = (int) preg_match_all('/\p{Han}/u', $plain_text);
    $latin_words = str_word_count((string) preg_replace('/\p{Han}+/u', ' ', $plain_text));
    $word_count = $han_chars + $latin_words;
    $reading_time = ceil($word_count / 200);
    $features['reading_time_quick'] = $reading_time <= 3 ? 1 : 0;
    $features['reading_time_medium'] = ($reading_time > 3 && $reading_time <= 10) ? 1 : 0;
    $features['reading_time_long'] = $reading_time > 10 ? 1 : 0;

    return $features;
}
/**
 * Jaccard similarity of two metadata feature sets.
 *
 * A feature belongs to a set only when its value is truthy. Features stored with the
 * value 0 (e.g. the length / comment / reading-time buckets that do not apply) are
 * ignored; counting them by key alone would make two posts in different buckets look
 * alike, because both arrays carry every bucket key.
 *
 * @param array $features1 Feature name => 1|0.
 * @param array $features2 Feature name => 1|0.
 * @return float|int |A ∩ B| / |A ∪ B|, or 0 when either set has no active feature.
 */
public function metadata_similarity($features1, $features2) {
    $active1 = array_filter((array) $features1);
    $active2 = array_filter((array) $features2);
    if (empty($active1) || empty($active2)) {
        return 0;
    }

    $shared = count(array_intersect_key($active1, $active2));
    // Array union by key: every distinct feature name exactly once.
    $total = count($active1 + $active2);

    return $shared / $total;
}
/**
* 获取所有文章的元数据特征(用于批量处理)
*/
public function get_all_posts_metadata($limit = 1000) {
global $wpdb;
$posts = $wpdb->get_results(
$wpdb->prepare(
"SELECT ID FROM {$wpdb->posts}
