#Please change Operator-Email = "YourEmailAdress@YourDomain" #Password not used yet. (Please change) Password = "XxXxyYzZ" #Configure which converters can be used to produce an XWI object #Format: # 1 line per entry # each entry consists of 3 ';' separated fields # #Entries are processed in order and the first match is executed # external converters have to be found via PATH and executable to be considered a match # the external converter command should take a filename as its parameter and convert that file # the result should be written to STDOUT # # mime-type ; External converter command ; Internal converter application/pdf ; MYpdftohtml -i -noframes -nomerge -stdout ; HTML #List of servernames that are aliases is in the file ./config_serveralias # (automatically updated by other programs) #use one server per line #example #www.100topwetland.com www.100wetland.com # means that www.100wetland.com is replaced by www.100topwetland.com during URL normalization <> #use either URL or HOST: (note the ':') to match regular expressions to # either the full URL or the HOST part of a URL. 
#Allow crawl of URLs or hostnames that match these regular expressions HOST: .*$ #Exclude URLs or hostnames that match these regular expressions # default: CGI and maps URL cgi-bin|htbin|cgi|\?|\.map$|_vti_ # default: binary files URL \.exe$|\.zip$|\.tar$|\.tgz$|\.gz$|\.hqx$|\.sdd$|\.mat$|\.raw$ URL \.EXE$|\.ZIP$|\.TAR$|\.TGZ$|\.GZ$|\.HQX$|\.SDD$|\.MAT$|\.RAW$ # default: Unparsable documents URL \.shar$|\.rmx$|\.rmd$|\.mdb$|\.sav$ URL \.SHAR$|\.RMX$|\.RMD$|\.MDB$|\.SAV$ # default: images URL \.gif$|\.jpg$|\.jpeg$|\.xpm$|\.tif$|\.tiff$|\.mpg$|\.mpeg$|\.mov$|\.wav$|\.au$|\.pcx$|\.xbm$|\.tga$|\.psd$ URL \.GIF$|\.JPG$|\.JPEG$|\.XPM$|\.TIF$|\.TIFF$|\.MPG$|\.MPEG$|\.MOV$|\.WAV$|\.AU$|\.PCX$|\.XBM$|\.TGA$|\.PSD$ # default: other binary formats URL \.pdb$|\.class$|\.ica$|\.ram$|.wmz$|.arff$|.rar$|\.vo$|\.fig$ URL \.PDB$|\.CLASS$|\.ICA$|\.RAM$|.WMZ$|.ARFF$|.RAR$|\.VO$|\.FIG$ #more excludes in the file config_exclude (automatically updated by other programs) <> #patterns to recognize and remove session ids in URLs sessionid lsessionid jsessionid SID PHPSESSID SessionID BV_SessionID #url is just a container for all URL-related configuration patterns