#@#Default configuration values for the Combine system
#Direct connection to Zebra indexing - for SearchEngine-in-a-box (default no connection)
#@#ZebraHost = NoDefaultValue
ZebraHost =
#Use a proxy server if this is defined (default no proxy)
#@#httpProxy = NoDefaultValue
httpProxy =
#Enable(1)/disable(0) automatic recycling of new links
AutoRecycleLinks = 1
#User agent handles redirects (1) or treats redirects as new links (0)
UserAgentFollowRedirects = 0
#Number of pages to process before restarting the harvester
HarvesterMaxMissions = 500
#Logging level (0 (least) - 10 (most))
Loglev = 0
#Enable(1)/disable(0) analysis of genre, language
doAnalyse = 1
#How long the summary should be. Use 0 to disable the summarization code
SummaryLength = 0
#Store(1)/do not store(0) the raw HTML in the database
saveHTML = 1
#Use(1)/do not use(0) Tidy to clean the HTML before parsing it
useTidy = 1
#Use(1)/do not use(0) OAI record status keeping in SQL database
doOAI = 1
#Extract(1)/do not extract(0) links from plain text
extractLinksFromText = 1
#Enable(1)/disable(0) topic classification (focused crawling)
#Generated by combineINIT based on --topic parameter
doCheckRecord = 0
#Which topic classification PlugIn module algorithm to use
#Combine::Check_record and Combine::PosCheck_record included by default
#see classifyPlugInTemplate.pm and documentation to write your own
classifyPlugIn = Combine::Check_record
###Parameters for Std topic classification algorithm
###StdTitleWeight = 10 #
###StdMetaWeight = 4 #
###StdHeadingsWeight = 2 #
###StdCutoffRel = 10 #Class score must be above this % to be counted
###StdCutoffNorm = 0.2 #normalised cutoff for summed normalised score
###StdCutoffTot = 90 #non normalised cutoff for summed total score
###Parameters for Pos topic classification algorithm
###PosCutoffRel = 1 #Class score must be above this % to be counted
###PosCutoffNorm = 0.002 #normalised cutoff for summed normalised score
###PosCutoffTot = 1 #non normalised cutoff for summed total score
#NOTE(review): undocumented retry limits; presumably the maximum number of
#retry attempts for harvesting (HarvestRetries) and for the Sdq component
#(SdqRetries) - confirm against the Combine documentation
HarvestRetries = 5
SdqRetries = 5
#Maximum length of a URL; longer will be silently discarded
maxUrlLength = 250
#Time in seconds to wait for a server to respond
UAtimeout = 30
#If we have seen this page before, use a conditional GET (If-Modified-Since) (1) or not (0)
UserAgentGetIfModifiedSince = 1
#NOTE(review): the WaitInterval* values in this group are undocumented; presumably
#they are in seconds, like the documented WaitInterval settings further down - confirm
#315360000 = 3650 days (about 10 years) if in seconds
WaitIntervalExpirationGuaranteed = 315360000
#2592000 = 30 days if in seconds
WaitIntervalHarvesterLockNotFound = 2592000
WaitIntervalHarvesterLockNotModified = 2592000
WaitIntervalHarvesterLockRobotRules = 2592000
#86400 = 1 day if in seconds
WaitIntervalHarvesterLockUnavailable = 86400
WaitIntervalRrdLockDefault = 86400
#345600 = 4 days if in seconds
WaitIntervalRrdLockNotFound = 345600
WaitIntervalRrdLockSuccess = 345600
#Time in seconds after successful download before allowing a page to be downloaded again (around 11 days)
WaitIntervalHarvesterLockSuccess = 1000000
#Time in seconds to wait before making a new reschedule if a reschedule results in an empty ready queue
WaitIntervalSchedulerGetJcf = 20
#Minimum time between accesses to the same host. Must be positive
WaitIntervalHost = 60
#Identifies MySQL database name, user and host
MySQLdatabase = NoDefaultValue
#Base directory for configuration files; initialized by Config.pm
#@#baseConfigDir = /etc/combine
#Directory for job specific configuration files; taken from 'jobname'
#@#configDir = NoDefaultValue
#Extensions of binary files (one per line, without the leading dot)
ps
jpg
jpeg
pdf
tif
tiff
mpg
mpeg
mov
wav
au
hqx
gz
z
tgz
exe
zip
sdd
doc
rtf
shar
mat
raw
wmz
arff
rar
#Configure which converters can be used to produce a XWI object
#Format:
# 1 line per entry
# each entry consists of 3 ';' separated fields
#
#Entries are processed in order and the first match is executed
# external converters have to be found via PATH and executable to be considered a match
# the external converter command should take a filename as parameter and convert that file
# the result should be written to STDOUT
#
# mime-type ; External converter command ; Internal converter
text/html ; ; GuessHTML
#Check this
www/unknown ; ; GuessHTML
text/plain ; ; GuessText
text/x-tex ; tth -g -w1 -r < ; TeXHTML
application/x-tex ; tth -g -w1 -r < ; TeXHTML
text/x-tex ; untex -a -e -giso ; TeXText
application/x-tex ; untex -a -e -giso ; TeXText
text/x-tex ; ; TeX
application/x-tex ; ; TeX
application/pdf ; pdftohtml -i -noframes -nomerge -stdout ; HTML
application/pdf ; pstotext ; Text
application/postscript ; pstotext ; Text
application/msword ; antiword -t ; Text
application/vnd.ms-excel ; xlhtml -fw ; HTML
application/vnd.ms-powerpoint ; ppthtml ; HTML
application/rtf ; unrtf --nopict --html ; HTML
image/gif ; ; Image
image/jpeg ; ; Image
image/tiff ; ; Image
#Exclude URLs or hostnames that match these regular expressions
#Malformed hostnames
HOST: http:\/\/\.
HOST: \@