001    package org.maltparser.parser.guide.instance;
002    
003    import java.io.BufferedReader;
004    import java.io.BufferedWriter;
005    import java.io.IOException;
006    import java.util.SortedMap;
007    
008    import java.util.ArrayList;
009    import java.util.TreeMap;
010    import java.util.TreeSet;
011    import java.util.regex.Pattern;
012    
013    import org.maltparser.core.exception.MaltChainedException;
014    import org.maltparser.core.feature.FeatureException;
015    import org.maltparser.core.feature.FeatureVector;
016    import org.maltparser.core.feature.function.FeatureFunction;
017    import org.maltparser.core.feature.function.Modifiable;
018    import org.maltparser.core.feature.value.SingleFeatureValue;
019    import org.maltparser.core.syntaxgraph.DependencyStructure;
020    import org.maltparser.parser.guide.ClassifierGuide;
021    import org.maltparser.parser.guide.GuideException;
022    import org.maltparser.parser.guide.Model;
023    import org.maltparser.parser.history.action.SingleDecision;
024    
025    /**
026    The feature divide model is used for divide the training instances into several models according to
027    a divide feature. Usually this strategy decrease the training and classification time, but can also decrease 
028    the accuracy of the parser.  
029    
030    @author Johan Hall
031    @since 1.0
032    */
033    public class FeatureDivideModel implements InstanceModel {
034            private Model parent;
035            private final SortedMap<Integer,AtomicModel> divideModels;
036            private FeatureVector masterFeatureVector;
037            private FeatureVector divideFeatureVector;
038            private int frequency = 0;
039            private FeatureFunction divideFeature;
040            private int divideThreshold;
041            private AtomicModel masterModel;
042            private ArrayList<Integer> divideFeatureIndexVector;
043            
044            /**
045             * Constructs a feature divide model.
046             * 
047             * @param features the feature vector used by the atomic model.
048             * @param parent the parent guide model.
049             * @throws MaltChainedException
050             */
051            public FeatureDivideModel(FeatureVector features, Model parent) throws MaltChainedException {
052                    setParent(parent);
053                    setFrequency(0);
054                    initSplitParam(features);
055                    divideModels = new TreeMap<Integer,AtomicModel>();
056                    if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) {
057                            masterModel = new AtomicModel(-1, masterFeatureVector, this);
058                    } else if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.CLASSIFY) {
059                            load();
060                    }
061            }
062            
063            public void addInstance(SingleDecision decision) throws MaltChainedException {
064                    if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.CLASSIFY) {
065                            throw new GuideException("Can only add instance during learning. ");
066                    } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
067                            throw new GuideException("The divide feature does not have a single value. ");
068                    }
069                    
070                    divideFeature.update();
071                    if (divideModels != null) { 
072                            if (!divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) {
073                                    divideModels.put(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), new AtomicModel(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode(), divideFeatureVector, this));
074                            }
075                            divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).addInstance(decision);
076                    } else {
077                            throw new GuideException("The feature divide models cannot be found. ");
078                    }
079            }
080            
081            public void noMoreInstances() throws MaltChainedException {
082    //              if (getGuide().getGuideMode() == Guide.GuideMode.CLASSIFY) {
083    //                      throw new GuideException("Can only finish all data during learning. ");
084    //              }
085                    
086                    if (divideModels != null) {
087                            divideFeature.updateCardinality();
088                            for (Integer index : divideModels.keySet()) {
089                                    divideModels.get(index).noMoreInstances();
090                            }
091                            final TreeSet<Integer> removeSet = new TreeSet<Integer>();
092                            for (Integer index : divideModels.keySet()) {
093                                    if (divideModels.get(index).getFrequency() <= divideThreshold) {
094                                            divideModels.get(index).moveAllInstances(masterModel, divideFeature, divideFeatureIndexVector);
095                                            removeSet.add(index);
096                                    }
097                            }
098                            for (Integer index : removeSet) {
099                                    divideModels.remove(index);
100                            }
101                            masterModel.noMoreInstances();
102    
103                    } else {
104                            throw new GuideException("The feature divide models cannot be found. ");
105                    }
106            }
107    
108            public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException {
109    //              if (getGuide().getGuideMode() == Guide.GuideMode.CLASSIFY) {
110    //                      throw new GuideException("Can only finish sentence during learning. ");
111    //              }
112    
113                    if (divideModels != null) { 
114                            for (AtomicModel divideModel : divideModels.values()) {
115                                    divideModel.finalizeSentence(dependencyGraph);
116                            }
117                    } else {
118                            throw new GuideException("The feature divide models cannot be found. ");
119                    }
120            }
121    
122            public boolean predict(SingleDecision decision) throws MaltChainedException {
123                    if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) {
124                            throw new GuideException("Can only predict during parsing. ");
125                    } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
126                            throw new GuideException("The divide feature does not have a single value. ");
127                    }
128                    
129                    //divideFeature.update();
130                    if (divideModels != null && divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) {
131                            return divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()).predict(decision);
132                    } else if (masterModel != null && masterModel.getFrequency() > 0) {
133                            return masterModel.predict(decision);
134                    } else {
135                            getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is " +
136                                            "no divide or master model that covers the divide value '"+((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()+"', as default" +
137                                                            " class code '1' is used. ");
138                            
139                            decision.addDecision(1); // default prediction
140                            //classCodeTable.getEmptyKBestList().addKBestItem(1); 
141                    }
142                    return true;
143            }
144    
145            public FeatureVector predictExtract(SingleDecision decision) throws MaltChainedException {
146                    return getAtomicModel().predictExtract(decision);
147            }
148            
149            public FeatureVector extract() throws MaltChainedException {
150                    return getAtomicModel().extract();
151            }
152            
153            private AtomicModel getAtomicModel() throws MaltChainedException {
154                    if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) {
155                            throw new GuideException("Can only predict during parsing. ");
156                    } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
157                            throw new GuideException("The divide feature does not have a single value. ");
158                    }
159                    
160                    if (divideModels != null && divideModels.containsKey(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode())) {
161                            return divideModels.get(((SingleFeatureValue)divideFeature.getFeatureValue()).getCode());
162                    } else if (masterModel != null && masterModel.getFrequency() > 0) {
163                            return masterModel;
164                    } else {
165                            getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is " +
166                                            "no divide or master model that covers the divide value '"+((SingleFeatureValue)divideFeature.getFeatureValue()).getCode()+"', as default" +
167                                                            " class code '1' is used. ");
168                    }
169                    return null;
170            }
171            
172            public void terminate() throws MaltChainedException {
173                    if (divideModels != null) {
174                            for (AtomicModel divideModel : divideModels.values()) { 
175                                    divideModel.terminate();
176                            }
177                    }
178                    if (masterModel != null) {
179                            masterModel.terminate();
180                    }
181            }
182            
183            public void train() throws MaltChainedException {
184                    for (AtomicModel divideModel : divideModels.values()) {
185                            divideModel.train();
186                    }
187                    masterModel.train();
188                    save();
189                    for (AtomicModel divideModel : divideModels.values()) {
190                            divideModel.terminate();
191                    }
192                    masterModel.terminate();
193            }
194            
195            /**
196             * Initialize the feature split parameters and the split feature vector and master feature vector
197             * according to the behavior strategy.
198             * 
199             * @param featureVector the parent guide model's feature vector.
200             * @throws MaltChainedException
201             */
202            protected void initSplitParam(FeatureVector featureVector) throws MaltChainedException {
203                    if (getGuide().getConfiguration().getOptionValue("guide", "data_split_column") == null 
204                                    || getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().length() == 0) {
205                            throw new GuideException("The option '--guide-data_split_column' cannot be found, when initializing the data split. ");
206                    }
207                    if (getGuide().getConfiguration().getOptionValue("guide", "data_split_structure") == null 
208                                    || getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().length() == 0) {
209                            throw new GuideException("The option '--guide-data_split_structure' cannot be found, when initializing the data split. ");
210                    }
211                    try {
212                            final String spec = "InputColumn(" + getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString().trim()+
213                                                            ", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString().trim() +")";
214                            divideFeature = featureVector.getFeatureModel().identifyFeature(spec);
215                    } catch (FeatureException e) {
216                            throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") cannot be initialized. ", e);
217                    }
218                    if (!(divideFeature instanceof Modifiable)) {
219                            throw new GuideException("The data split feature 'InputColumn("+getGuide().getConfiguration().getOptionValue("guide", "data_split_column").toString()+", "+getGuide().getConfiguration().getOptionValue("guide", "data_split_structure").toString()+") does not implement Modifiable interface. ");
220                    }
221                    divideFeatureIndexVector = new ArrayList<Integer>();
222                    for (int i = 0; i < featureVector.size(); i++) {
223                            if (featureVector.get(i).equals(divideFeature)) {
224                                    divideFeatureIndexVector.add(i);
225                            }
226                    }
227                    
228    //              if ((Boolean)getGuide().getConfiguration().getOptionValue("malt0.4", "behavior") == true) {
229    //                      /* MaltParser 0.4 removes the divide feature for all divide models. For the "Sum-up" model or
230    //                       * master model adds the divide feature in the end of the feature vector.
231    //                       */
232    //                      masterFeatureVector = (FeatureVector)featureVector.clone();
233    //                      for (Integer i : divideFeatureIndexVector) {
234    //                              masterFeatureVector.remove(masterFeatureVector.get(i));
235    //                      }
236    //                      for (Integer i : divideFeatureIndexVector) {
237    //                              masterFeatureVector.add(featureVector.get(i));
238    //                      }
239    //              
240    //                      divideFeatureVector = (FeatureVector)featureVector.clone();
241    //                      for (Integer i : divideFeatureIndexVector) {
242    //                              divideFeatureVector.remove(divideFeatureVector.get(i));
243    //                      }
244    //              } else {
245                            masterFeatureVector = featureVector;
246                            divideFeatureVector = (FeatureVector)featureVector.clone();
247                            for (Integer i : divideFeatureIndexVector) {
248                                    divideFeatureVector.remove(divideFeatureVector.get(i));
249                            }
250    //              }
251                    try {
252                            if (getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString() != null) {
253                                    divideThreshold = Integer.parseInt(getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold").toString());
254                            } else {
255                                    divideThreshold = 0;
256                            }
257                    } catch (NumberFormatException e) {
258                            throw new GuideException("The --guide-data_split_threshold option is not an integer value. ", e);
259                    }
260            }
261            
262            /**
263             * Saves the feature divide model settings .fsm file.
264             * 
265             * @throws MaltChainedException
266             */
267            protected void save() throws MaltChainedException {
268                    try {
269                            final BufferedWriter out = new BufferedWriter(getGuide().getConfiguration().getConfigurationDir().getOutputStreamWriter(getModelName()+".dsm"));
270                            out.write(masterModel.getIndex() + "\t" + masterModel.getFrequency() + "\n");
271    
272                            if (divideModels != null) {
273                                    for (AtomicModel divideModel : divideModels.values()) {
274                                            out.write(divideModel.getIndex() + "\t" + divideModel.getFrequency() + "\n");
275                            }
276                            }
277                            out.close();
278                    } catch (IOException e) {
279                            throw new GuideException("Could not write to the guide model settings file '"+getModelName()+".dsm"+"', when " +
280                                            "saving the guide model settings to file. ", e);
281                    }
282            }
283            
284            /**
285             * Loads the feature divide model settings .fsm file.
286             * 
287             * @throws MaltChainedException
288             */
289            protected void load() throws MaltChainedException {
290                    try {
291                            final BufferedReader in = new BufferedReader(getGuide().getConfiguration().getConfigurationDir().getInputStreamReaderFromConfigFile(getModelName()+".dsm"));
292                            final Pattern tabPattern = Pattern.compile("\t");
293                            while(true) {
294                                    String line = in.readLine();
295                                    if(line == null) break;
296                                    String[] cols = tabPattern.split(line);
297                                    if (cols.length != 2) { 
298                                            throw new GuideException("");
299                                    }
300                                    int code = -1;
301                                    int freq = 0;
302                                    try {
303                                            code = Integer.parseInt(cols[0]);
304                                            freq = Integer.parseInt(cols[1]);
305                                    } catch (NumberFormatException e) {
306                                            throw new GuideException("Could not convert a string value into an integer value when loading the feature divide model settings (.fsm). ", e);
307                                    }
308                                    if (code == -1) { 
309                                            masterModel = new AtomicModel(-1, masterFeatureVector, this);
310                                            masterModel.setFrequency(freq);
311                                    } else if (divideModels != null) {
312                                            divideModels.put(code, new AtomicModel(code, divideFeatureVector, this));
313                                            divideModels.get(code).setFrequency(freq);
314                                    }
315                                    setFrequency(getFrequency()+freq);
316                            }
317                            in.close();
318                    } catch (IOException e) {
319                            throw new GuideException("Could not read from the guide model settings file '"+getModelName()+".dsm"+"', when " +
320                                            "loading the guide model settings. ", e);
321                    }       
322            }
323            
324            /**
325             * Returns the parent model
326             * 
327             * @return the parent model
328             */
329            public Model getParent() {
330                    return parent;
331            }
332    
333            public ClassifierGuide getGuide() {
334                    return parent.getGuide();
335            }
336            
337            /**
338             * Sets the parent model
339             * 
340             * @param parent the parent model
341             */
342            protected void setParent(Model parent) throws MaltChainedException {
343                    this.parent = parent;
344            }
345    
346    
347            public String getModelName() throws MaltChainedException {
348                    try {
349                            return parent.getModelName();
350                    } catch (NullPointerException e) {
351                            throw new GuideException("The parent guide model cannot be found. ", e);
352                    }
353            }
354    
355            /**
356             * Returns the "sum-up" or master feature vector
357             * 
358             * @return a feature vector object
359             */
360            public FeatureVector getMasterFeatureVector() {
361                    return masterFeatureVector;
362            }
363    
364            /**
365             * Returns the divide feature vector
366             * 
367             * @return a feature vector object
368             */
369            public FeatureVector getDivideFeatureVector() {
370                    return divideFeatureVector;
371            }
372            
373            /**
374             * Returns the frequency (number of instances)
375             * 
376             * @return the frequency (number of instances)
377             */
378            public int getFrequency() {
379                    return frequency;
380            }
381    
382            /**
383             * Increase the frequency by 1
384             */
385            public void increaseFrequency() {
386                    if (parent instanceof InstanceModel) {
387                            ((InstanceModel)parent).increaseFrequency();
388                    }
389                    frequency++;
390            }
391            
392            public void decreaseFrequency() {
393                    if (parent instanceof InstanceModel) {
394                            ((InstanceModel)parent).decreaseFrequency();
395                    }
396                    frequency--;
397            }
398            
399            /**
400             * Sets the frequency (number of instances)
401             * 
402             * @param frequency (number of instances)
403             */
404            protected void setFrequency(int frequency) {
405                    this.frequency = frequency;
406            }
407    
408    
409            /* (non-Javadoc)
410             * @see java.lang.Object#toString()
411             */
412            public String toString() {
413                    final StringBuilder sb = new StringBuilder();
414                    //TODO
415                    return sb.toString();
416            }
417    }