#!/usr/bin/perl -w use strict; use Lingua::EN::Tagger; my $COUNT = 50; die "usage: phrases.pl inputfile speaker length occurrences\n" unless (scalar @ARGV) == 4; my ($input, $speaker, $len, $occ) = @ARGV; $speaker = lc($speaker); my ($speakers) = parse($input); my $phrases = phrases( $speakers->{$speaker}, $len, $occ ); my @ranked = sort { $phrases->{$b} <=> $phrases->{$a} } keys %{ $phrases }; print "Top $COUNT phrases for speaker '$speaker'\n" . "-------------------------------------------\n"; for my $i (1..$COUNT) { print "$i. " . $ranked[$i] . " (" . $phrases->{ $ranked[$i] } . ")\n" if $ranked[$i]; } # phrases - returns noun phrases for $text and filters for phrases of # at least $length long and $occur occurences sub phrases { my ($text, $length, $occur) = @_; my $p = new Lingua::EN::Tagger( stem => 0, unknown_word_tag => "nn" ); my %dict = $p->get_words( $speakers->{$speaker} ); foreach my $phrase (keys %dict) { # count the # of words in the phrase my @words = split(/ /, $phrase); my $word_cnt = ( scalar @words ) || 1; # determine the # of phrases my $phrase_cnt = $dict{$phrase} / $word_cnt; # score = # of phrases + # of words $dict{$phrase} = $phrase_cnt + $word_cnt; # filter delete $dict{$phrase} if ($phrase_cnt < $occur) || ($word_cnt < $length); } return \%dict; } sub parse { my ($fn) = @_; my %speakers = (); open IN, "<$fn"; while () { chomp; my ($speaker, $text) = /^(LEHRER|BUSH|KERRY): (.*)$/og; $speakers{lc($speaker)} .= " $text"; } close IN; return \%speakers; }