1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91
|
(***********************************************************************)
(* *)
(* SpamOracle -- a Bayesian spam filter *)
(* *)
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
(* *)
(* Copyright 2002 Institut National de Recherche en Informatique et *)
(* en Automatique. This file is distributed under the terms of the *)
(* GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt *)
(* *)
(***********************************************************************)
(* $Id: rankmsg.ml,v 1.4 2003/03/23 09:14:11 xleroy Exp $ *)
(* Message ranking *)
open Mail
open Database
let word_count_in w res =
let count = ref 0 in
for i = 0 to Array.length res - 1 do
if w = fst res.(i) then incr count
done;
!count
let add_word w p res =
let i = ref 0 in
while !i < Array.length res
&& abs_float (p -. 0.5) <= abs_float(snd res.(!i) -. 0.5)
do
incr i
done;
if !i < Array.length res then begin
for j = Array.length res - 1 downto !i + 1 do
res.(j) <- res.(j - 1)
done;
res.(!i) <- (w, p)
end
let normalize (p : float) low high =
if p > high then high else if p < low then low else p
let process_word (db, res) w =
try
let (g, b) = Hashtbl.find db.s_freq w in
if word_count_in w res < !Config.max_repetitions then begin
let g = 2 * g in
let pgood = float g /. float db.s_num_good
and pbad = float b /. float db.s_num_spam in
let p =
normalize (pbad /. (pgood +. pbad))
!Config.low_freq_limit !Config.high_freq_limit in
add_word w p res
end
with Not_found ->
()
let process_words ctx txt =
Wordsplit.iter (process_word ctx) txt
let process_msg ctx m =
iter_message (process_words ctx) m
let bayes_rule res =
let probs = List.map snd (Array.to_list res) in
let prod = List.fold_left ( *. ) 1.0 probs
and cprod = List.fold_left ( *. ) 1.0 (List.map (fun x -> 1.0 -. x) probs) in
prod /. (prod +. cprod)
type rank =
{ spam_prob: float;
num_meaningful: int;
explanation: string }
let rank_message db msg =
let res = Array.make !Config.num_words_retained ("", 0.5) in
process_msg (db, res) msg;
let p = bayes_rule res in
let meaningful = ref 0 in
while !meaningful < Array.length res && fst res.(!meaningful) <> ""
do incr meaningful done;
let summary = Buffer.create 200 in
for i = 0 to !meaningful - 1 do
let (w, p) = res.(i) in
Printf.bprintf summary "%s:%02d " w (truncate (p *. 100.0))
done;
{ spam_prob = p;
num_meaningful = !meaningful;
explanation = Buffer.contents summary }
|