SC to be used with NP_SpamCheck
Original at: http://www.legolasweb.nl/content/view/44/35/
Filename: SC.SpamRater.php
<?php
//
// Spam Rater 1.3
//
// By: Legolas
// email: legolas@legolasweb.nl
// web: http://www.legolasweb.nl/
//
//
// Use:
// string SCType_SpamRater();
//     returns the kind of content this checker handles ("comment")
// bool SC_SpamRater(mixed $message, mixed $title);
//     computes a spam rating between 0 and 100 for $message
//       100% - Spam
//        70% - Good-chance it's spam
//         0% - Probably not spam
//     and returns true when that rating is above 70, false otherwise

/**
 * Reports which kind of content this spam checker applies to.
 *
 * @return string Always "comment".
 */
function SCType_SpamRater()
{
    return "comment";
}

/**
 * Splits text fragments into their non-empty, whitespace-separated words
 * after stripping HTML tags.
 *
 * @param array $fragments List of strings.
 * @return array Flat list of words, in fragment order.
 */
function _spamrater_words($fragments)
{
    $words = array();
    foreach ($fragments as $fragment) {
        $pieces = preg_split('/[ \n\r\t]/', strip_tags($fragment), -1, PREG_SPLIT_NO_EMPTY);
        if ($pieces === false) {
            continue;
        }
        foreach ($pieces as $piece) {
            $words[] = $piece;
        }
    }
    return $words;
}

/**
 * Scoring formula shared by the individual tests:
 * ($total / $divisor) * ($worth / $total).
 *
 * Both $total and $divisor must be greater than zero; every caller checks
 * that before calling.
 *
 * NOTE(review): algebraically this is $worth / $divisor, so a test contributes
 * less the larger $divisor (e.g. the number of hits) becomes. That is how the
 * original rater is calibrated and it is preserved here unchanged.
 *
 * @param int       $total   Number of items examined.
 * @param int       $divisor Number of distinct items / hits.
 * @param int|float $worth   Maximum contribution of the test.
 * @return float|int
 */
function _spamrater_share($total, $divisor, $worth)
{
    return ($total / $divisor) * ($worth / $total);
}

/**
 * Rates a message for spam and says whether it crosses the spam threshold.
 *
 * The message is lower-cased and split on `<a href="`, `">` and `</a>` into
 * plain text ("stuff"), link targets and link texts. Several heuristics then
 * add to or subtract from a rating. When a readable blacklist file exists at
 * $DIR_PLUGINS . "sc/spamrater.txt" (comma-separated, base64-encoded words)
 * it is used for two extra tests, and frequent words of messages rated 100 or
 * more are appended to it.
 *
 * Reads the global $DIR_PLUGINS; may rewrite the blacklist file.
 *
 * @param mixed $message Message body (HTML allowed).
 * @param mixed $title   Title compared against the link texts.
 * @return bool True when the rating is above 70, false otherwise.
 */
function SC_SpamRater($message, $title)
{
    global $DIR_PLUGINS;

    $blacklist = $DIR_PLUGINS . "sc/spamrater.txt";
    $rating = 0;

    $message = stripslashes(strtolower(str_replace("'", "\"", (string) $message)));
    // The title gets the same normalisation as the message; otherwise a title
    // containing capitals could never equal a (lower-cased) link text.
    $normalized_title = stripslashes(strtolower(str_replace("'", "\"", (string) $title)));

    // Pieces cycle through: plain text, link target, link text.
    $test_parts = preg_split('/(<a href="|">|<\/a>)/', $message);
    if ($test_parts === false) {
        $test_parts = array($message);
    }
    $stuff = array();
    $links = array();
    $titles = array();
    foreach ($test_parts as $index => $piece) {
        switch ($index % 3) {
            case 0:
                $stuff[] = $piece;
                break;
            case 1:
                $links[] = $piece;
                break;
            default:
                $titles[] = $piece;
                break;
        }
    }

    // Load the blacklist. An unreadable file disables the blacklist features
    // instead of being treated as empty (which would later overwrite it).
    $bl_switch = false;
    $bl_arr = array();
    if (is_file($blacklist)) {
        $bl_str = file_get_contents($blacklist);
        if ($bl_str !== false) {
            $bl_arr = array_map('base64_decode', explode(",", $bl_str));
            $bl_switch = true;
        }
    }
    // Keyed copy for constant-time, exact membership checks.
    $bl_lookup = array_flip($bl_arr);

    // Words of the plain text first, then words of the link texts.
    $stuffwords = array_merge(_spamrater_words($stuff), _spamrater_words($titles));

    // Links on text (-100): the less link-target text there is relative to
    // plain text, the more is subtracted.
    $test_worth = 100;
    $stuffchars = strlen(implode('', $stuff));
    $linkchars = strlen(implode('', $links));
    if ($stuffchars > 0) {
        $score = ($linkchars / $stuffchars) * $test_worth;
        if ($score > $test_worth) {
            $score = $test_worth;
        }
    } elseif ($linkchars > 0) {
        // Nothing but links: the ratio is unbounded, so use the capped score.
        $score = $test_worth;
    } else {
        // Neither text nor links: nothing link-like to score.
        $score = 0;
    }
    $rating += $score - $test_worth;

    // Links test (50)
    $test_worth = 50;
    $unique_links = array_values(array_unique($links));
    if (count($unique_links) > 0) {
        $rating += _spamrater_share(count($links), count($unique_links), $test_worth);
    }

    // Link base test (50)
    $test_worth = 50;
    $unique_link_bases = array();
    $unique_link_sets = array();
    foreach ($unique_links as $index => $link) {
        // Drop the scheme (http and https) and the query string, then split
        // the rest into host and path segments.
        $without_scheme = (string) preg_replace('#^https?://#', '', $link);
        $without_query = explode("?", $without_scheme);
        $unique_link_sets[$index] = explode("/", $without_query[0]);

        $base = $unique_link_sets[$index][0];
        if (!in_array($base, $unique_link_bases, true)) {
            $unique_link_bases[] = $base;
        }
    }
    if (count($unique_link_bases) > 0) {
        $rating += _spamrater_share(count($unique_links), count($unique_link_bases), $test_worth);
    }

    // Link parts test (50)
    $test_worth = 50;
    $unique_link_parts = array();
    $total_part_count = 0;
    foreach ($unique_link_sets as $segments) {
        $last = count($segments) - 1;
        // Segment 0 is the host; only path segments are examined.
        for ($j = 1; $j <= $last; $j++) {
            $segment = $segments[$j];
            if ($j == $last) {
                // Cut everything from the last dot on (the file extension).
                // NOTE(review): a last segment without any dot is dropped
                // entirely; kept as in the original because it affects the
                // calibration of the rating.
                $dotted = explode(".", $segment);
                array_pop($dotted);
                $segment = implode(".", $dotted);
            }
            $parts = preg_split('/(-|_)/', $segment);
            foreach ($parts as $part) {
                if ($part !== '' && !in_array($part, $unique_link_parts, true)) {
                    $unique_link_parts[] = $part;
                }
                // Empty parts are counted too, as in the original.
                $total_part_count++;
            }
        }
    }
    if (count($unique_link_parts) > 0) {
        $rating += _spamrater_share($total_part_count, count($unique_link_parts), $test_worth);
    }

    if ($bl_switch) {
        // Black list test (50)
        $test_worth = 50;
        $test = 0;
        foreach ($stuffwords as $word) {
            if (isset($bl_lookup[$word])) {
                $test++;
            }
        }
        if ($test > 0) {
            $rating += _spamrater_share(count($stuffwords), $test, $test_worth);
        }

        // Black list link parts test (50)
        $test_worth = 50;
        $test = 0;
        foreach ($unique_link_parts as $part) {
            if (isset($bl_lookup[$part])) {
                $test++;
            }
        }
        if ($test > 0) {
            $rating += _spamrater_share(count($unique_link_parts), $test, $test_worth);
        }
    }

    // Link title test (50): link text equal to a link target or to the title.
    $test_worth = 50;
    $test = 0;
    foreach ($titles as $link_text) {
        if (in_array($link_text, $links, true) || $link_text === $normalized_title) {
            $test++;
        }
    }
    if ($test > 0) {
        $rating += _spamrater_share(count($titles), $test, $test_worth);
    }

    // Header test (10): the whole message is wrapped in <h1>...</h1>.
    $test_worth = 10;
    if (substr($message, 0, 4) == "<h1>" && substr($message, -5) == "</h1>") {
        $rating += $test_worth;
    }

    // Link in stuff test (50): a link target also appears in the plain text.
    $test_worth = 50;
    $test = 0;
    foreach ($stuff as $fragment) {
        foreach ($unique_links as $link) {
            // strpos() returns 0 for a match at the very start, so compare
            // against false; an empty href is never counted as a match.
            if ($link !== '' && strpos($fragment, $link) !== false) {
                $test++;
            }
        }
    }
    if ($test > 0) {
        $rating += _spamrater_share(count($stuff), $test, $test_worth);
    }

    // Black list updating (15% of the text): runs on the unclamped rating.
    $minimal_occurence = 15;
    $minimal_rating = 100;
    if ($bl_switch && $rating >= $minimal_rating) {
        // Count every word / link part that is not blacklisted yet.
        $propose = array();
        foreach (array_merge($stuffwords, $unique_link_parts) as $candidate) {
            if (isset($bl_lookup[$candidate])) {
                continue;
            }
            if (!isset($propose[$candidate])) {
                $propose[$candidate] = 0;
            }
            $propose[$candidate]++;
        }

        $threshold = array_sum($propose) * ($minimal_occurence / 100);
        foreach ($propose as $candidate => $occurrences) {
            if ($occurrences > $threshold) {
                // Numeric-looking words come back as integer keys.
                $bl_arr[] = (string) $candidate;
            }
        }

        $encoded = array();
        foreach ($bl_arr as $entry) {
            if (!empty($entry)) {
                $encoded[] = base64_encode($entry);
            }
        }

        // Updating the list is best effort: a failure must not break the
        // rating, but it is reported instead of being silently ignored.
        if (!is_writable($blacklist)
            || file_put_contents($blacklist, implode(",", $encoded), LOCK_EX) === false
        ) {
            error_log("SC_SpamRater: could not update blacklist " . $blacklist);
        }
    }

    // Clamp to the documented 0..100 scale before applying the threshold.
    if ($rating < 0) {
        $rating = 0;
    }
    if ($rating > 100) {
        $rating = 100;
    }

    return $rating > 70;
}
?>