/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * isenglish: make a guess as to whether text argument is English text.
 * Used for filtering spam sent in other charsets when the
 * charset specifier is omitted or is in the body rather
 * than the headers (which seems to be very common in spam).
 * Copyright 2001 by Akkana Peck, akkana@shallowsky.com.
 * You are free to use or modify this program under the GPL.
 *
 * Usage: isenglish [-d] [text ...]
 *   With text arguments, each argument is scored and the scores averaged.
 *   With no text arguments, standard input is scored.
 *   Prints the score and exits 0 if it reaches the threshold, 1 otherwise.
 *
 * Define ISENGLISH_NO_MAIN to compile only the scoring routines
 * (for example to link GetScore into another program or a test).
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

/* Table of English letter frequencies per 1000 letters, from
 * http://library.thinkquest.org/28005/flashed/thelab/cryptograms/frequency.shtml
 * (Not consulted by GetScore at present.)
 */
int EnglishLetterFrequencies[] = {
    73, 9, 30, 44, 130, 28, 16, 35, 74, 2, 3, 35, 25,   /* A - M */
    78, 74, 27, 3, 77, 63, 93, 27, 13, 16, 5, 19, 1     /* N - Z */
};

/* Non-zero makes GetScore print its counters; set by the -d option. */
static int Debug = 0;

/* Scores below this make the program exit with status 1. */
enum { ENGLISH_THRESHOLD = 55 };

/* Returns non-zero if c is a lowercase ASCII vowel (a, e, i, o, u). */
int IsVowel(char c)
{
    return (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u');
}

/* Returns a score from 0 to 100 indicating confidence that it's English.
 *
 * str: NUL-terminated text to examine, or NULL to read standard input
 *      until EOF.
 *
 * The score is the percentage of characters that are letters or digits,
 * plus 10, halved when there are more than five words whose average
 * length is under three letters, and capped at 100.  Empty input has no
 * evidence either way and scores 0.
 */
int GetScore(const char *str)
{
    int letters[26] = {0};      /* per-letter tally; not yet used in score */
    int num = 0;
    int punct = 0;
    int alpha = 0;
    int unprint = 0;
    int vowels = 0;             /* tallied; not yet used in score */
    int total = 0;
    int words = 0;
    int thiswordlength = 0;
    int wordchars = 0;          /* sum of the lengths of completed words */
    int avwordlength = 0;
    int score = 0;
    int fromStdin = (str == NULL);

    for (;;) {
        int c;

        if (fromStdin) {
            c = getchar();
            if (c == EOF)
                break;
        } else {
            if (*str == '\0')
                break;
            /* Go through unsigned char: <ctype.h> functions are undefined
             * for negative values, and a signed 0xFF byte would otherwise
             * compare equal to EOF. */
            c = (unsigned char)*str++;
        }

        /* First check for word ends */
        if ((isspace(c) || !isprint(c)) && thiswordlength > 0) {
            ++words;
            wordchars += thiswordlength;
            thiswordlength = 0;
        }

        if (isdigit(c)) {
            ++num;
        } else if (isalpha(c)) {
            ++thiswordlength;
            ++alpha;
            c = tolower(c);
            if (IsVowel((char)c))
                ++vowels;
            /* Guard the index: isalpha may accept letters outside a-z. */
            if (c >= 'a' && c <= 'z')
                ++letters[c - 'a'];
        } else if (isprint(c)) {
            ++punct;
        } else {
            ++unprint;
        }
        ++total;
    }

    /* A word that runs up to the end of the input still counts. */
    if (thiswordlength > 0) {
        ++words;
        wordchars += thiswordlength;
    }

    /* Compare alphanum chars to punct chars (skip when there is no input,
     * which would otherwise divide by zero). */
    if (total > 0)
        score = (int)((long long)(alpha + num) * 100 / total) + 10;

    /* Check word lengths */
    if (words > 0)
        avwordlength = wordchars / words;

    /* Unscientific: just root out extreme cases */
    if (avwordlength < 3 && words > 5)
        score /= 2;

    if (score > 100)
        score = 100;

    if (Debug) {
        printf("Score = %d\n", score);
        printf(" (%d alpha, %d digit, %d punct, %d unprintable,\n",
               alpha, num, punct, unprint);
        printf(" %d words, av. word length %d)\n", words, avwordlength);
    }

    return score;
}

#ifndef ISENGLISH_NO_MAIN
/* Parses leading '-' options (-d or -D turns on debug output), then scores
 * every remaining argument and averages the results; with no text
 * arguments, scores standard input instead.
 */
int main(int argc, char **argv)
{
    int score = 0;
    int arg = 1;
    int ntexts;

    /* Consume leading options; stop at the first non-option argument. */
    while (arg < argc && argv[arg][0] == '-') {
        if (argv[arg][1] == 'd' || argv[arg][1] == 'D') {
            printf("Debug mode\n");
            Debug = 1;
        }
        ++arg;
    }

    ntexts = argc - arg;
    if (ntexts > 0) {
        int i;
        for (i = arg; i < argc; ++i)
            score += GetScore(argv[i]);
        score /= ntexts;
    } else {
        score = GetScore(NULL);
    }

    printf("%d\n", score);

    return (score < ENGLISH_THRESHOLD) ? 1 : 0;
}
#endif /* ISENGLISH_NO_MAIN */