#@#Default configuration values Combine system
#Direct connection to Zebra indexing - for SearchEngine-in-a-box (default no connection)
#@#ZebraHost = NoDefaultValue
ZebraHost =
#Direct connection to Solr indexing
#@#SolrHost = NoDefaultValue
SolrHost =
#Enable(1)/disable(0) fulltext-index in MySQL table search
MySQLfulltext = 0
#Use a proxy server if this is defined (default no proxy)
#@#httpProxy = NoDefaultValue
httpProxy =
#Enable(1)/disable(0) automatic recycling of new links
AutoRecycleLinks = 1
#User agent handles redirects (1) or treats redirects as new links (0)
UserAgentFollowRedirects = 0
#Number of pages to process before restarting the harvester
HarvesterMaxMissions = 500
#Logging level (0 (least) - 10 (most))
Loglev = 0
#Enable(1)/disable(0) analysis of genre, language
doAnalyse = 1
analysePlugin =
relTextPlugin =
#How long the summary should be. Use 0 to disable the summarization code
SummaryLength = 0
#Store(1)/do not store(0) the raw HTML in the database
saveHTML = 1
#Use(1)/do not use(0) Tidy to clean the HTML before parsing it
useTidy = 0
#Use(1)/do not use(0) OAI record status keeping in SQL database
doOAI = 1
#Extract(1)/do not extract(0) links from plain text
extractLinksFromText = 1
#Enable(1)/disable(0) topic classification (focused crawling)
#Generated by combineINIT based on --topic parameter
doCheckRecord = 0
#Which topic classification PlugIn module algorithm to use
#Combine::Check_record and Combine::PosCheck_record included by default
#NEW SVM classifier: Combine::classifySVM
#see classifyPlugInTemplate.pm and documentation to write your own
classifyPlugIn = Combine::Check_record
#Filename for the SVM model
#@#SVMmodel = NoDefaultValue
SVMmodel =
###Parameters for Std topic classification algorithm
###StdTitleWeight = 10 #
###StdMetaWeight = 4 #
###StdHeadingsWeight = 2 #
###StdCutoffRel = 10 #Class score must be above this % to be counted
###StdCutoffNorm = 0.2 #normalised cutoff for summed normalised score
###StdCutoffTot = 90 #non normalised cutoff for summed total score
###Parameters for Pos topic classification algorithm
###PosCutoffRel = 1 #Class score must be above this % to be counted
###PosCutoffNorm = 0.002 #normalised cutoff for summed normalised score
###PosCutoffTot = 1 #non normalised cutoff for summed total score
#NOTE(review): undocumented here - by name, retry limits for harvesting and for
#SDQ operations; confirm exact semantics against the Combine documentation
HarvestRetries = 5
SdqRetries = 5
#Maximum length of a URL; longer will be silently discarded
maxUrlLength = 250
#Time in seconds to wait for a server to respond
UAtimeout = 30
#If we have seen this page before use Get-If-Modified (1) or not (0)
UserAgentGetIfModifiedSince = 1
#NOTE(review): the WaitInterval* values below carry no unit here; presumably
#seconds, matching the documented WaitInterval* settings that follow - confirm.
#If seconds: 86400 = 1 day, 345600 = 4 days, 2592000 = 30 days,
#315360000 = 3650 days (about 10 years)
WaitIntervalExpirationGuaranteed = 315360000
WaitIntervalHarvesterLockNotFound = 2592000
WaitIntervalHarvesterLockNotModified = 2592000
WaitIntervalHarvesterLockRobotRules = 2592000
WaitIntervalHarvesterLockUnavailable = 86400
WaitIntervalRrdLockDefault = 86400
WaitIntervalRrdLockNotFound = 345600
WaitIntervalRrdLockSuccess = 345600
#Time in seconds after successful download before allowing a page to be downloaded again (around 11 days)
WaitIntervalHarvesterLockSuccess = 1000000
#Time in seconds to wait before making a new reschedule if a reschedule results in an empty ready queue
WaitIntervalSchedulerGetJcf = 20
#Minimum time between accesses to the same host. Must be positive
WaitIntervalHost = 60
#URL scheduling algorithm
SchedulingAlgorithm = default
#Identifies MySQL database name, user and host
MySQLdatabase = NoDefaultValue
#Base directory for configuration files; initialized by Config.pm
#@#baseConfigDir = /etc/combine
#Directory for job specific configuration files; taken from 'jobname'
#@#configDir = NoDefaultValue
#Extensions of binary files (one per line, without the leading dot)
arff
au
avi
class
exe
fig
gif
gz
hqx
ica
jpeg
jpg
mat
mdb
mov
mp3
mpeg
mpg
msi
pcx
pdb
psd
ram
rar
raw
rmd
rmx
sav
sdd
shar
tar
tga
tgz
tif
tiff
vo
wav
wmv
wmz
xbm
xpm
z
zip
#Configure which converters can be used to produce a XWI object
#Format:
# 1 line per entry
# each entry consists of 3 ';' separated fields
#
#Entries are processed in order and the first match is executed
# external converters must be found via PATH and be executable to be considered a match
# the external converter command should take a filename as parameter and convert that file
# the result should be written to STDOUT
#
# mime-type ; External converter command ; Internal converter
text/html ; ; GuessHTML
#Check this
www/unknown ; ; GuessHTML
text/plain ; ; GuessText
text/x-tex ; tth -g -w1 -r < ; TeXHTML
application/x-tex ; tth -g -w1 -r < ; TeXHTML
text/x-tex ; untex -a -e -giso ; TeXText
application/x-tex ; untex -a -e -giso ; TeXText
text/x-tex ; ; TeX
application/x-tex ; ; TeX
application/pdf ; pdftohtml -i -noframes -nomerge -nodrm -stdout ; HTML
application/pdf ; pstotext ; Text
application/postscript ; pstotext ; Text
application/msword ; antiword -t ; Text
application/vnd.ms-excel ; xlhtml -fw ; HTML
application/vnd.ms-powerpoint ; ppthtml ; HTML
application/rtf ; unrtf --nopict --html ; HTML
image/gif ; ; Image
image/jpeg ; ; Image
image/tiff ; ; Image
#Exclude URLs or hostnames that match these regular expressions
#Malformed hostnames
HOST: http:\/\/\.
HOST: \@