#!/usr/bin/perl -w

###  retrieve_similar_tickets.pl

###  After the tickets stored in an Excel spreadsheet have been subjected to the
###  preprocessing steps listed in the script `ticket_preprocessor_doc_modeler.pl',
###  you use the script shown here to retrieve the tickets that are most similar
###  to a given query ticket.

###  For obvious reasons, you would want the names of the database files
###  mentioned in this script to match the names in the ticket
###  preprocessing script.

###  IMPORTANT  IMPORTANT  IMPORTANT  IMPORTANT  IMPORTANT:
###
###  The parameter
###
###                 min_idf_threshold 
###
###
###  depends on the number of tickets in your Excel spreadsheet.  If the
###  number of tickets is in the low hundreds, this parameter is likely to
###  require a value of 1.5 to 1.8.  If the number of tickets is in the
###  thousands, the value of this parameter is likely to be between 2 and
###  3.  See the writeup on this parameter in the API description in the
###  main documentation.


use lib '../blib/lib', '../blib/arch';

use strict;
use Algorithm::TicketClusterer;

# Spreadsheet column names handed to the clusterer as its
# `clustering_fieldname' and `unique_id_fieldname' options.
my $fieldname_for_clustering = 'Description';
my $unique_id_fieldname      = 'Request No';

# Names of the on-disk database files.  These should match the names used
# in the ticket preprocessing script.
my $raw_tickets_db         = 'raw_tickets.db';
my $processed_tickets_db   = 'processed_tickets.db';
my $stemmed_tickets_db     = 'stemmed_tickets.db';
my $inverted_index_db      = 'inverted_index.db';
my $tickets_vocab_db       = 'tickets_vocab.db';
my $idf_db                 = 'idf.db';
my $tkt_doc_vecs_db        = 'tkt_doc_vecs.db';
my $tkt_doc_vecs_normed_db = 'tkt_doc_vecs_normed.db';

# Build the clusterer object from the settings above.  See the note at the
# top of this file before changing `min_idf_threshold'.
my $clusterer = Algorithm::TicketClusterer->new(
    clustering_fieldname   => $fieldname_for_clustering,
    unique_id_fieldname    => $unique_id_fieldname,
    raw_tickets_db         => $raw_tickets_db,
    processed_tickets_db   => $processed_tickets_db,
    stemmed_tickets_db     => $stemmed_tickets_db,
    inverted_index_db      => $inverted_index_db,
    tickets_vocab_db       => $tickets_vocab_db,
    idf_db                 => $idf_db,
    tkt_doc_vecs_db        => $tkt_doc_vecs_db,
    tkt_doc_vecs_normed_db => $tkt_doc_vecs_normed_db,
    min_idf_threshold      => 1.3,
    how_many_retrievals    => 5,
    debug3                 => 1,    # for similarity retrieval
);

# The query ticket.  An ID may be given as the first command-line argument;
# when none is given, the sample ID below is used.
# (Another sample ID previously used with this script: 1377224.)
my $ticket_num = @ARGV ? $ARGV[0] : 1377212;

$clusterer->restore_ticket_vectors_and_inverted_index();

# The result is used below as a reference to a hash that maps each retrieved
# ticket ID to its similarity score.
my $retrieved_hash_ref = $clusterer->retrieve_similar_tickets_with_vsm( $ticket_num );

print "\nDisplaying the tickets considered most similar to the query ticket $ticket_num\n\n";

# Tolerate an undefined result instead of dying on the dereference, and tell
# the user explicitly when there is nothing to show.
my %retrieved_hash = $retrieved_hash_ref ? %{$retrieved_hash_ref} : ();
print "No similar tickets were retrieved for the query ticket $ticket_num\n"
    unless %retrieved_hash;

# Highest score first.  Ties are broken by ticket ID so that the ranking is
# the same on every run (hash key order alone is not stable across runs).
my @ranked_ticket_ids =
    sort { $retrieved_hash{$b} <=> $retrieved_hash{$a} || $a cmp $b }
    keys %retrieved_hash;

my $rank = 1;
foreach my $ticket_id (@ranked_ticket_ids) {
    my $similarity_score = $retrieved_hash{$ticket_id};
    print "\n\n\n --------- Retrieved ticket at similarity rank $rank   (similarity score: $similarity_score) ---------\n";
    $clusterer->show_processed_ticket_clustering_data_for_given_id( $ticket_id );
    $clusterer->show_original_ticket_for_given_id( $ticket_id );
    $rank++;
}