#Please change
Operator-Email = "YourEmailAdress@YourDomain"
#Password not used yet. (Please change)
Password = "XxXxyYzZ"
#Configure which converters can be used to produce an XWI object
#Format:
# 1 line per entry
# each entry consists of 3 ';' separated fields
#
#Entries are processed in order and the first match is executed
# external converters have to be found via PATH and executable to be considered a match
# the external converter command should take a filename as parameter and convert that file
# the result should be written to STDOUT
#
# mime-type ; External converter command ; Internal converter
application/pdf ; MYpdftohtml -i -noframes -nomerge -stdout ; HTML
#The list of server names that are aliases is in the file ./config_serveralias
# (automatically updated by other programs)
#use one server per line
#example
#www.100topwetland.com www.100wetland.com
# means that www.100wetland.com is replaced by www.100topwetland.com during URL normalization
<>
#use either URL or HOST: (note the ':') to match regular expressions against
# either the full URL or the HOST part of a URL.
#Allow crawl of URLs or hostnames that match these regular expressions
HOST: .*$
#Exclude URLs or hostnames that match these regular expressions
# default: CGI and maps
URL cgi-bin|htbin|cgi|\?|\.map$|_vti_
# default: binary files
URL \.exe$|\.zip$|\.tar$|\.tgz$|\.gz$|\.hqx$|\.sdd$|\.mat$|\.raw$
URL \.EXE$|\.ZIP$|\.TAR$|\.TGZ$|\.GZ$|\.HQX$|\.SDD$|\.MAT$|\.RAW$
# default: Unparsable documents
URL \.shar$|\.rmx$|\.rmd$|\.mdb$|\.sav$
URL \.SHAR$|\.RMX$|\.RMD$|\.MDB$|\.SAV$
# default: images
URL \.gif$|\.jpg$|\.jpeg$|\.xpm$|\.tif$|\.tiff$|\.mpg$|\.mpeg$|\.mov$|\.wav$|\.au$|\.pcx$|\.xbm$|\.tga$|\.psd$
URL \.GIF$|\.JPG$|\.JPEG$|\.XPM$|\.TIF$|\.TIFF$|\.MPG$|\.MPEG$|\.MOV$|\.WAV$|\.AU$|\.PCX$|\.XBM$|\.TGA$|\.PSD$
# default: other binary formats
# (each alternative escapes its leading dot so that only a literal ".ext"
#  suffix is excluded; an unescaped '.' would match any character)
URL \.pdb$|\.class$|\.ica$|\.ram$|\.wmz$|\.arff$|\.rar$|\.vo$|\.fig$
URL \.PDB$|\.CLASS$|\.ICA$|\.RAM$|\.WMZ$|\.ARFF$|\.RAR$|\.VO$|\.FIG$
#more excludes in the file config_exclude (automatically updated by other programs)
<>
#patterns to recognize and remove sessionids in URLs
sessionid
lsessionid
jsessionid
SID
PHPSESSID
SessionID
BV_SessionID
#url is just a container for all URL related configuration patterns