#@#Default configuration values Combine system
#
# Layout restored to one comment or one directive per physical line.
# Lines starting with '#@#' or '###' are commented-out entries kept for reference.

#Direct connection to Zebra indexing - for SearchEngine-in-a-box (default no connection)
#@#ZebraHost = NoDefaultValue
ZebraHost =

#Direct connection to Solr indexing
#@#SolrHost = NoDefaultValue
SolrHost =

#Enable(1)/disable(0) fulltext-index in MySQL table search
MySQLfulltext = 0

#Use a proxy server if this is defined (default no proxy)
#@#httpProxy = NoDefaultValue
httpProxy =

#Enable(1)/disable(0) automatic recycling of new links
AutoRecycleLinks = 1

#User agent handles redirects (1) or treat redirects as new links (0)
UserAgentFollowRedirects = 0

#Number of pages to process before restarting the harvester
HarvesterMaxMissions = 500

#Logging level (0 (least) - 10 (most))
Loglev = 0

#Enable(1)/disable(0) analysis of genre, language
doAnalyse = 1
analysePlugin =
relTextPlugin =

#How long the summary should be. Use 0 to disable the summarization code
SummaryLength = 0

#Store(1)/do not store(0) the raw HTML in the database
saveHTML = 1

#Use(1)/do not use(0) Tidy to clean the HTML before parsing it
useTidy = 0

#Use(1)/do not use(0) OAI record status keeping in SQL database
doOAI = 1

#Extract(1)/do not extract(0) links from plain text
extractLinksFromText = 1

#Enable(1)/disable(0) topic classification (focused crawling)
#Generated by combineINIT based on --topic parameter
doCheckRecord = 0

#Which topic classification PlugIn module algorithm to use
#Combine::Check_record and Combine::PosCheck_record included by default
#NEW SVM classifier: Combine::classifySVM
#see classifyPlugInTemplate.pm and documentation to write your own
classifyPlugIn = Combine::Check_record

#Filename for the SVM model
#@#SVMmodel = NoDefaultValue
SVMmodel =

###Parameters for Std topic classification algorithm
###StdTitleWeight = 10 #
###StdMetaWeight = 4 #
###StdHeadingsWeight = 2 #
###StdCutoffRel = 10 #Class score must be above this % to be counted
###StdCutoffNorm = 0.2 #normalised cutoff for summed normalised score
###StdCutoffTot = 90 #non normalised cutoff for summed total score

###Parameters for Pos topic classification algorithm
###PosCutoffRel = 1 #Class score must be above this % to be counted
###PosCutoffNorm = 0.002 #normalised cutoff for summed normalised score
###PosCutoffTot = 1 #non normalised cutoff for summed total score

HarvestRetries = 5
SdqRetries = 5

#Maximum length of a URL; longer will be silently discarded
maxUrlLength = 250

#Time in seconds to wait for a server to respond
UAtimeout = 30

#If we have seen this page before use Get-If-Modified (1) or not (0)
UserAgentGetIfModifiedSince = 1

# NOTE(review): the WaitInterval* values below carry no unit in this file;
# presumably seconds, like the documented WaitInterval* entries further down - confirm.
WaitIntervalExpirationGuaranteed = 315360000
WaitIntervalHarvesterLockNotFound = 2592000
WaitIntervalHarvesterLockNotModified = 2592000
WaitIntervalHarvesterLockRobotRules = 2592000
WaitIntervalHarvesterLockUnavailable = 86400
WaitIntervalRrdLockDefault = 86400
WaitIntervalRrdLockNotFound = 345600
WaitIntervalRrdLockSuccess = 345600

#Time in seconds after successful download before allowing a page to be downloaded again (around 11 days)
WaitIntervalHarvesterLockSuccess = 1000000

#Time in seconds to wait before making a new reschedule if a reschedule results in an empty ready queue
WaitIntervalSchedulerGetJcf = 20

#Minimum time between accesses to the same host. Must be positive
WaitIntervalHost = 60

#URL scheduling algorithm
SchedulingAlgorithm = default

#Identifies MySQL database name, user and host
MySQLdatabase = NoDefaultValue

#Base directory for configuration files; initialized by Config.pm
#@#baseConfigDir = /etc/combine

#Directory for job specific configuration files; taken from 'jobname'
#@#configDir = NoDefaultValue

#Extensions of binary files
# NOTE(review): no enclosing block delimiter is visible around this list; it
# presumably belongs inside a named block - confirm against the installed file.
# NOTE(review): assumes one extension per line - confirm.
arff
au
avi
class
exe
fig
gif
gz
hqx
ica
jpeg
jpg
mat
mdb
mov
mp3
mpeg
mpg
msi
pcx
pdb
psd
ram
rar
raw
rmd
rmx
sav
sdd
shar
tar
tga
tgz
tif
tiff
vo
wav
wmv
wmz
xbm
xpm
z
zip

#Configure which converters can be used to produce a XWI object
#Format:
#  1 line per entry
#  each entry consists of 3 ';' separated fields
#
#Entries are processed in order and the first match is executed
#  external converters have to be found via PATH and executable to be considered a match
#  the external converter command should take a filename as parameter and convert that file
#  the result should be coming on STDOUT
#
# NOTE(review): no enclosing block delimiter is visible around this table; it
# presumably belongs inside a named block - confirm against the installed file.
#
# mime-type ; External converter command ; Internal converter
text/html ; ; GuessHTML
#Check this
www/unknown ; ; GuessHTML
text/plain ; ; GuessText
text/x-tex ; tth -g -w1 -r < ; TeXHTML
application/x-tex ; tth -g -w1 -r < ; TeXHTML
text/x-tex ; untex -a -e -giso ; TeXText
application/x-tex ; untex -a -e -giso ; TeXText
text/x-tex ; ; TeX
application/x-tex ; ; TeX
application/pdf ; pdftohtml -i -noframes -nomerge -nodrm -stdout ; HTML
application/pdf ; pstotext ; Text
application/postscript ; pstotext ; Text
application/msword ; antiword -t ; Text
application/vnd.ms-excel ; xlhtml -fw ; HTML
application/vnd.ms-powerpoint ; ppthtml ; HTML
application/rtf ; unrtf --nopict --html ; HTML
image/gif ; ; Image
image/jpeg ; ; Image
image/tiff ; ; Image

#Exclude URLs or hostnames that match these regular expressions
# NOTE(review): no enclosing block delimiter is visible around these patterns,
# and the list may continue beyond this point - confirm against the installed file.
#Malformed hostnames
HOST: http:\/\/\.
HOST: \@