[go: up one dir, main page]

File: rankmsg.ml

package info (click to toggle)
spamoracle 1.4-4
  • links: PTS
  • area: main
  • in suites: sarge
  • size: 232 kB
  • ctags: 240
  • sloc: ml: 1,198; makefile: 137; sh: 61
file content (91 lines) | stat: -rw-r--r-- 2,916 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
(***********************************************************************)
(*                                                                     *)
(*                 SpamOracle -- a Bayesian spam filter                *)
(*                                                                     *)
(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
(*                                                                     *)
(*  Copyright 2002 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  This file is distributed under the terms of the   *)
(*  GNU Public License version 2, http://www.gnu.org/licenses/gpl.txt  *)
(*                                                                     *)
(***********************************************************************)

(* $Id: rankmsg.ml,v 1.4 2003/03/23 09:14:11 xleroy Exp $ *)

(* Message ranking *)

open Mail
open Database

let word_count_in w res =
  let count = ref 0 in
  for i = 0 to Array.length res - 1 do
    if w = fst res.(i) then incr count
  done;
  !count

let add_word w p res =
  let i = ref 0 in
  while !i < Array.length res
     && abs_float (p -. 0.5) <= abs_float(snd res.(!i) -. 0.5)
  do
    incr i
  done;
  if !i < Array.length res then begin
    for j = Array.length res - 1 downto !i + 1 do
      res.(j) <- res.(j - 1)
    done;
    res.(!i) <- (w, p)
  end

let normalize (p : float) low high =
  if p > high then high else if p < low then low else p

let process_word (db, res) w =
  try
    let (g, b) = Hashtbl.find db.s_freq w in
    if word_count_in w res < !Config.max_repetitions then begin
      let g = 2 * g in
      let pgood = float g /. float db.s_num_good
      and pbad = float b /. float db.s_num_spam in
      let p = 
        normalize (pbad /. (pgood +. pbad)) 
                  !Config.low_freq_limit !Config.high_freq_limit in
      add_word w p res
    end
  with Not_found ->
    ()

let process_words ctx txt =
  Wordsplit.iter (process_word ctx) txt

let process_msg ctx m =
  iter_message (process_words ctx) m

let bayes_rule res =
  let probs = List.map snd (Array.to_list res) in
  let prod = List.fold_left ( *. ) 1.0 probs
  and cprod = List.fold_left ( *. ) 1.0 (List.map (fun x -> 1.0 -. x) probs) in
  prod /. (prod +. cprod)

type rank =
  { spam_prob: float;
    num_meaningful: int;
    explanation: string }

let rank_message db msg =
  let res = Array.make !Config.num_words_retained ("", 0.5) in
  process_msg (db, res) msg;
  let p = bayes_rule res in
  let meaningful = ref 0 in
  while !meaningful < Array.length res && fst res.(!meaningful) <> ""
  do incr meaningful done;
  let summary = Buffer.create 200 in
  for i = 0 to !meaningful - 1 do
    let (w, p) = res.(i) in
    Printf.bprintf summary "%s:%02d " w (truncate (p *. 100.0))
  done;
  { spam_prob = p;
    num_meaningful = !meaningful;
    explanation = Buffer.contents summary }