#!/usr/bin/perl
# Takes a newsgroup message from the 20 newsgroup dataset on standard input
# and converts it to a list of the words it contains, printed one per line.
#
# Everything up to and including the "Lines: " header line is skipped; only
# the lines that follow it are treated as message body and tokenized.

use strict;
use warnings;

# Split one line of message text into lowercased words.
#
# Parameters:
#   $text - a single line of text (a trailing newline is harmless, since
#           "\n" is one of the separator characters).
# Returns:
#   The list of whitespace-separated tokens, lowercased, that contain at
#   least one letter. Returns an empty list when nothing is left after
#   cleaning.
sub extract_words {
    my ($text) = @_;

    # Convert separators (punctuation, brackets, control characters) to blanks.
    $text =~ s/[\'\r\n\":,\?\-\[\]\(\);\t<>\+\*\_\^\`\%\!\{\}[:cntrl:]\|]/ /g;
    $text =~ s/\.(\.)+/ /g;    # Remove series of more than one "."
    $text =~ s/\~(\~)+/ /g;    # Remove series of more than one "~"

    # Remove a "." that is followed by a blank, but keep it otherwise (it may
    # be part of an email address or URL). A "." that is the very last
    # character of the text has no blank after it and is therefore kept too.
    $text =~ s/\. / /g;

    # Collapse runs of blanks and trim one leading/trailing blank.
    $text =~ s/ +/ /g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    return if length($text) == 0;

    # Keep only the tokens that contain at least one letter.
    return grep { /[[:alpha:]]/ } split ' ', lc $text;
}

# Read the message from STDIN, skip the header, and print each word of the
# body on its own line.
sub main {
    my $in_body = 0;    # becomes true once the "Lines: " header has been seen

    while (my $line = <STDIN>) {
        chomp $line;
        if ($in_body) {
            for my $word (extract_words($line)) {
                print "$word\n";
            }
        }
        elsif ($line =~ /^Lines: /) {
            # The line after this header is treated as the start of the body.
            $in_body = 1;
        }
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded with
# require (e.g. from a test) without consuming STDIN.
main() unless caller;