package WWW::Wappalyzer; use 5.006; use strict; use warnings; use base qw ( Exporter ); our @EXPORT_OK = qw( detect get_categories add_clues_file ); use lib::abs; use JSON qw(); my %_categories; my @_clues_file_list = ( lib::abs::path( './apps.json' ) ); # List of multi per web page application categories my %MULTIPLE_APP_CATS = map { $_ => 1 } qw( widgets analytics javascript-frameworks video-players font-scripts miscellaneous advertizing-networks ); =head1 NAME WWW::Wappalyzer - Perl port of Wappalyzer (L) =head1 DESCRIPTION Uncovers the technologies used on websites: detects content management systems, web shops, web servers, JavaScript frameworks, analytics tools and many more. Lacks 'version' and 'confidence' support of original Wappalyzer in favour of speed. Clues: L More info: L =head1 VERSION Version 0.06 =cut our $VERSION = '0.06'; =head1 SYNOPSIS use WWW::Wappalyzer; use LWP::UserAgent; my $response = LWP::UserAgent->new->get( 'http://www.drupal.org' ); my %detected = WWW::Wappalyzer::detect( html => $response->decoded_content, headers => $response->headers, ); # %detected = ( # 'web-servers' => [ 'Apache' ], # 'cms' => [ 'Drupal' ], # 'cache-tools' => [ 'Varnish' ], # 'analytics' => [ 'Google Analytics' ], # 'operating-systems' => [ 'CentOS' ] # ); =head1 EXPORT None by default. =head1 SUBROUTINES/METHODS =head2 detect my %detected = detect( %params ) Tries to detect CMS, framework, etc for given html code, http headers, url. Available parameters: html - html code of web page headers - hash ref to http headers list url - url of web page cats - array ref to a list of trying categories, defaults to all categories; less cats => less cpu usage Returns the hash of detected applications by categorie: ( cms => [ 'Joomla' ], 'javascript-frameworks' => [ 'jQuery', 'jQuery UI' ], ) =cut sub detect { my %params = @_; return () unless $params{html} || $params{headers} || $params{url}; # Lazy load and process clues from JSON file _load_categories() unless scalar keys %_categories; my @cats = $params{cats} && ( ref( $params{cats} ) || '' ) eq 'ARRAY' ? @{ $params{cats} } : get_categories(); my $headers_ref; if ( $params{headers} ) { # make all headers name lowercase while ( my ( $name, $value ) = each %{ $params{headers} } ) { $headers_ref->{ lc $name } = $value; } } my %detected; my %tried_multi_cat_apps; for my $cat ( @cats ) { my $apps_ref = $_categories{ $cat } or die "Unknown categorie $cat"; APP: for my $app_ref ( @$apps_ref ) { my $detected; # Some speed optimizations if ( @cats > 1 && $app_ref->{multi_cat} && exists $tried_multi_cat_apps{ $app_ref->{name} } ) { $detected = $tried_multi_cat_apps{ $app_ref->{name} }; } else { # Try regexes... if ( defined $headers_ref && exists $app_ref->{headers_re} ) { my %headers_re = %{ $app_ref->{headers_re} }; while ( my ( $header, $re ) = each %headers_re ) { my $header_val = $headers_ref->{ $header } or next; if ( $header_val =~ m{$re}ims ) { $detected = 1; last; } } } unless ( $detected ) { # try from most to least relevant method for my $re_type ( qw( html url ) ) { if ( defined $params{ $re_type } && exists $app_ref->{ $re_type. '_re' } && $params{ $re_type } =~ m{$app_ref->{ $re_type. '_re' }}ims ) { $detected = 1; last; } } } # Some speed optimizations if ( @cats > 1 && $app_ref->{multi_cat} ) { $tried_multi_cat_apps{ $app_ref->{name} } = $detected; } } next unless $detected; # Detected! push @{ $detected{ $cat } }, $app_ref->{name}; last APP unless $MULTIPLE_APP_CATS{ $cat }; } } return %detected; } =head2 get_categories my @cats = get_categories() Returns the array of all application categories. =cut sub get_categories { # Lazy load and process clues from JSON files _load_categories() unless scalar keys %_categories; return keys %_categories; } # Loads and processes clues from JSON files sub _load_categories { for my $clue_file ( @_clues_file_list ) { open my $fh, '<', $clue_file or die "Can not read clues file $clue_file."; local $/ = undef; my $json = <$fh>; close $fh; # Do not support "Optional fields" $json =~ s{ \\\\; (?: version | confidence ) [^"]+? " }{"}xig; my $cfg_ref = eval { JSON::decode_json( $json ) }; die "Can't parse clue file $clue_file: $@" if $@; my $cats_ref = $cfg_ref->{categories} or die "Broken clues file $clue_file. Can not find categories."; my $apps_ref = $cfg_ref->{apps} or die "Broken clues file $clue_file. Can not find applications."; # Process apps while ( my ( $app, $app_ref ) = each %$apps_ref ) { my $new_app_ref = _process_app_clues( $app, $app_ref ) or next; my @cats = @{ $app_ref->{cats} } or next; $new_app_ref->{multi_cat} = 1 if @cats > 1; for my $cat_id ( @cats ) { my $cat = $cats_ref->{ $cat_id } or next; push @{ $_categories{ $cat } }, $new_app_ref; } } } } # Process clues of given app sub _process_app_clues { my ( $app, $app_ref ) = @_; my $new_app_ref = { name => $app }; my @fields = grep { exists $app_ref->{$_} } qw( script html meta headers url ); my @html_re; # Precompile regexps for my $field ( @fields ) { my $re_ref = $app_ref->{$field}; my @re_list = !ref $re_ref ? $re_ref : ref $re_ref eq 'ARRAY' ? ( map { _escape_re( $_ ) } @$re_ref ) : () ; if ( $field eq 'html' ) { push @html_re, map { qr/(?-x:$_)/ } @re_list; } elsif ( $field eq 'script' ) { push @html_re, map { qr/ < \s* script [^>]+ src \s* = \s* ["'] (?-x:[^"']*$_[^"']*) ["'] /x } @re_list; } elsif ( $field eq 'url' ) { $new_app_ref->{url_re} = join ' | ', map { qr/(?-x:$_)/ } @re_list; $new_app_ref->{url_re} = qr/$new_app_ref->{url_re}/x; } elsif ( $field eq 'meta' ) { for my $key ( keys %$re_ref ) { my $name_re = qr{ name \s* = \s* ["']? $key ["']? }x; my $re = _escape_re( $re_ref->{$key} ); $re = qr/$re/; my $content_re = qr{ content \s* = \s* ["'] (?-x:[^"']*$re[^"']*) ["'] }x; push @html_re, qr/ < \s* meta \s+ (?: (?: $name_re \s+ $content_re ) # | (?: $content_re \s+ $name_re ) # hangs sometimes ) /x; } } elsif ( $field eq 'headers' ) { for my $key ( keys %$re_ref ) { my $re = _escape_re( $re_ref->{$key} ); $new_app_ref->{headers_re}{ lc $key } = qr/$re/; } } } if ( @html_re ) { # Clue all html regexps into one regexp $new_app_ref->{html_re} = join ' | ', map { "(?: $_ )" } @html_re; $new_app_ref->{html_re} = qr/$new_app_ref->{html_re}/x; } return $new_app_ref; } # Escape special symbols in regexp string of config file sub _escape_re { my ( $re ) = @_; # Escape { } braces $re =~ s/ ([{}]) /[$1]/xig; return $re; } =head2 add_clues_file add_clues_file( $filepath ) Puts additional clues file to a list of processed clues files. See apps.json as format sample. =cut sub add_clues_file { my ( $filepath ) = @_; push @_clues_file_list, $filepath; # just clear out categories to lazy load later %_categories = (); } =head1 AUTHOR Alexander Nalobin, C<< >> =head1 BUGS Please report any bugs or feature requests to C, or through the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head1 SUPPORT You can find documentation for this module with the perldoc command. perldoc WWW::Wappalyzer You can also look for information at: =over 4 =item * RT: CPAN's request tracker (report bugs here) L =item * AnnoCPAN: Annotated CPAN documentation L =item * CPAN Ratings L =item * Search CPAN L =back =head1 ACKNOWLEDGEMENTS =head1 LICENSE AND COPYRIGHT Copyright 2013 Alexander Nalobin. This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License. See http://dev.perl.org/licenses/ for more information. =cut 1; # End of WWW::Wappalyzer