#@#Default configuration values Combine system

#Direct connection to Zebra indexing - for SearchEngine-in-a-box (default no connection)
#@#ZebraHost = NoDefaultValue
ZebraHost =

#Use a proxy server if this is defined (default no proxy)
#@#httpProxy = NoDefaultValue
httpProxy =

#Enable(1)/disable(0) automatic recycling of new links
AutoRecycleLinks = 1

#User agent handles redirects (1) or treat redirects as new links (0)
UserAgentFollowRedirects = 0

#Number of pages to process before restarting the harvester
HarvesterMaxMissions = 500

#Logging level (0 (least) - 10 (most))
Loglev = 0

#Enable(1)/disable(0) analysis of genre, language
doAnalyse = 1

#How long the summary should be. Use 0 to disable the summarization code
SummaryLength = 0

#Store(1)/do not store(0) the raw HTML in the database
saveHTML = 1

#Use(1)/do not use(0) Tidy to clean the HTML before parsing it
useTidy = 1

#Use(1)/do not use(0) OAI record status keeping in SQL database
doOAI = 1

#Extract(1)/do not extract(0) links from plain text
extractLinksFromText = 1

#Enable(1)/disable(0) topic classification (focused crawling)
#Generated by combineINIT based on --topic parameter
doCheckRecord = 0

#Which topic classification PlugIn module algorithm to use
#Combine::Check_record and Combine::PosCheck_record included by default
#see classifyPlugInTemplate.pm and documentation to write your own
classifyPlugIn = Combine::Check_record

###Parameters for Std topic classification algorithm
###StdTitleWeight = 10 #
###StdMetaWeight = 4 #
###StdHeadingsWeight = 2 #
###StdCutoffRel = 10 #Class score must be above this % to be counted
###StdCutoffNorm = 0.2 #normalised cutoff for summed normalised score
###StdCutoffTot = 90 #non normalised cutoff for summed total score

###Parameters for Pos topic classification algorithm
###PosCutoffRel = 1 #Class score must be above this % to be counted
###PosCutoffNorm = 0.002 #normalised cutoff for summed normalised score
###PosCutoffTot = 1 #non normalised cutoff for summed total score

HarvestRetries = 5
SdqRetries = 5

#Maximum length of a URL; longer will be silently discarded
maxUrlLength = 250

#Time in seconds to wait for a server to respond
UAtimeout = 30

#If we have seen this page before use Get-If-Modified (1) or not (0)
UserAgentGetIfModifiedSince = 1

WaitIntervalExpirationGuaranteed = 315360000
WaitIntervalHarvesterLockNotFound = 2592000
WaitIntervalHarvesterLockNotModified = 2592000
WaitIntervalHarvesterLockRobotRules = 2592000
WaitIntervalHarvesterLockUnavailable = 86400
WaitIntervalRrdLockDefault = 86400
WaitIntervalRrdLockNotFound = 345600
WaitIntervalRrdLockSuccess = 345600

#Time in seconds after successful download before allowing a page to be downloaded again (around 11 days)
WaitIntervalHarvesterLockSuccess = 1000000

#Time in seconds to wait before making a new reschedule if a reschedule results in an empty ready queue
WaitIntervalSchedulerGetJcf = 20

#Minimum time between accesses to the same host. Must be positive
WaitIntervalHost = 60

#Identifies MySQL database name, user and host
MySQLdatabase = NoDefaultValue

#Base directory for configuration files; initialized by Config.pm
#@#baseConfigDir = /etc/combine

#Directory for job specific configuration files; taken from 'jobname'
#@#configDir = NoDefaultValue

#Extensions of binary files
#NOTE(review): this list has no enclosing block delimiters in this copy; they may
#have been stripped, and the one-extension-per-line layout is assumed - confirm
#against the parser / upstream file before deploying
ps
jpg
jpeg
pdf
tif
tiff
mpg
mpeg
mov
wav
au
hqx
gz
z
tgz
exe
zip
sdd
doc
rtf
shar
mat
raw
wmz
arff
rar

#Configure which converters can be used to produce an XWI object
#Format:
# 1 line per entry
# each entry consists of 3 ';' separated fields
#
#Entries are processed in order and the first match is executed
# external converters have to be found via PATH and executable to be considered a match
# the external converter command should take a filename as parameter and convert that file
# the result should be coming on STDOUT
#
#NOTE(review): as with the extension list, no enclosing block delimiters are
#visible around this table in this copy - confirm whether they were lost
#
# mime-type ; External converter command ; Internal converter
text/html ; ; GuessHTML
#Check this
www/unknown ; ; GuessHTML
text/plain ; ; GuessText
text/x-tex ; tth -g -w1 -r < ; TeXHTML
application/x-tex ; tth -g -w1 -r < ; TeXHTML
text/x-tex ; untex -a -e -giso ; TeXText
application/x-tex ; untex -a -e -giso ; TeXText
text/x-tex ; ; TeX
application/x-tex ; ; TeX
application/pdf ; pdftohtml -i -noframes -nomerge -stdout ; HTML
application/pdf ; pstotext ; Text
application/postscript ; pstotext ; Text
application/msword ; antiword -t ; Text
application/vnd.ms-excel ; xlhtml -fw ; HTML
application/vnd.ms-powerpoint ; ppthtml ; HTML
application/rtf ; unrtf --nopict --html ; HTML
image/gif ; ; Image
image/jpeg ; ; Image
image/tiff ; ; Image

#Exclude URLs or hostnames that match these regular expressions
#NOTE(review): no enclosing block delimiters are visible around these patterns
#in this copy - confirm whether they were lost
#Malformed hostnames
HOST: http:\/\/\.
HOST: \@