001    package org.maltparser.core.feature.spec.reader;
002    
003    import java.io.BufferedReader;
004    import java.io.IOException;
005    import java.io.InputStreamReader;
006    import java.net.URL;
007    import java.util.ArrayList;
008    import java.util.EnumMap;
009    import java.util.regex.Pattern;
010    
011    import org.maltparser.core.exception.MaltChainedException;
012    import org.maltparser.core.feature.FeatureException;
013    import org.maltparser.core.feature.spec.SpecificationModels;
014    /**
015    *
016    *
017    * @author Johan Hall
018    */
019    public class ParReader implements FeatureSpecReader {
020            public enum DataStructures {
021                    STACK, INPUT, LEFTCONTEXT, RIGHTCONTEXT
022            };
023            public enum ColumnNames {
024                    POS, DEP, LEX, LEMMA, CPOS, FEATS
025            };
026            private EnumMap<ColumnNames, String> columnNameMap;
027            private EnumMap<DataStructures, String> dataStructuresMap;
028            private boolean useSplitFeats = true;
029            private boolean covington = false;
030            private boolean pppath;
031            private boolean pplifted;
032            private boolean ppcoveredRoot;
033            
034            public ParReader() throws MaltChainedException {
035                    initializeColumnNameMap();
036                    initializeDataStructuresMap();
037                    setPppath(false);
038                    setPplifted(false);
039                    setPpcoveredRoot(false);
040            }
041            
042            public void load(URL specModelURL, SpecificationModels featureSpecModels) throws MaltChainedException {
043                    BufferedReader br = null;
044                    Pattern tabPattern = Pattern.compile("\t");
045                    if (specModelURL == null) {
046                            throw new FeatureException("The feature specification file cannot be found. ");
047                    }
048                    try {
049                            br = new BufferedReader(new InputStreamReader(specModelURL.openStream()));
050                    } catch (IOException e) {
051                            throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e);
052                    }               
053                    
054                    if (br != null) {
055                            int specModelIndex = featureSpecModels.getNextIndex();
056                            String fileLine;
057                            String items[];
058                            StringBuilder featureText = new StringBuilder();
059                            String splitfeats = "";
060                            ArrayList<String> fileLines = new ArrayList<String>();
061                            ArrayList<String> orderFileLines = new ArrayList<String>();
062                            while (true) {
063                                    try {
064                                            fileLine = br.readLine();
065                                    } catch (IOException e) {
066                                            throw new FeatureException("Could not read the feature specification file '"+specModelURL.toString()+"'. ", e);
067                                    }
068                                    if (fileLine == null) {
069                                            break;
070                                    }
071                                    if (fileLine.length() <= 1 && fileLine.trim().substring(0, 2).trim().equals("--")) {
072                                            continue;
073                                    }
074                                    fileLines.add(fileLine);
075                            }
076                            try {
077                                    br.close();
078                            } catch (IOException e) {
079                                    throw new FeatureException("Could not close the feature specification file '"+specModelURL.toString()+"'. ", e);
080                            }
081    
082                            for (int j = 0; j < fileLines.size(); j++) {
083                                    orderFileLines.add(fileLines.get(j));
084                            }
085    
086                            boolean deprel = false;
087                            for (int j=0; j < orderFileLines.size(); j++) {
088                                    deprel = false;
089                                    featureText.setLength(0);
090                                    splitfeats = "";
091                                    items = tabPattern.split(orderFileLines.get(j));
092                                    if (items.length < 2) {
093                                            throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' must contain at least two columns.");
094                                    }
095                                    if (!(columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim())) || columnNameMap.containsValue(items[0].trim()))) {
096                                            throw new FeatureException("Column one in the feature specification file '"+specModelURL.toString()+"' contains an unknown value '"+items[0].trim()+"'. ");
097                                    }
098                                    if (items[0].trim().equalsIgnoreCase("DEP") || items[0].trim().equalsIgnoreCase("DEPREL")) {
099                                            featureText.append("OutputColumn(DEPREL, ");
100                                            deprel = true;
101                                    } else {
102                                            if (columnNameMap.containsKey(ColumnNames.valueOf(items[0].trim()))) {
103                                                    featureText.append("InputColumn("+columnNameMap.get(ColumnNames.valueOf(items[0].trim()))+", ");
104                                            } else if (columnNameMap.containsValue(items[0].trim())) {
105                                                    featureText.append("InputColumn("+items[0].trim()+", ");
106                                            }
107                                            if (items[0].trim().equalsIgnoreCase("FEATS") && isUseSplitFeats()) {
108                                                    splitfeats = "Split(";
109                                            }
110                                    }
111                                    if (!(items[1].trim().equalsIgnoreCase("STACK") || items[1].trim().equalsIgnoreCase("INPUT") || items[1].trim().equalsIgnoreCase("CONTEXT"))) {
112                                            throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should be either 'STACK', 'INPUT' or 'CONTEXT' (Covington), not '"+items[1].trim()+"'. ");
113                                    }
114                                    int offset = 0;
115                                    if (items.length >= 3) {
116                                            try {
117                                                    offset = new Integer(Integer.parseInt(items[2]));
118                                            } catch (NumberFormatException e) {
119                                                    throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' contains a illegal integer value. ", e);
120                                            }
121                                    }
122                                    String functionArg = "";
123                                    
124                                    if (items[1].trim().equalsIgnoreCase("CONTEXT")) {
125                                            if (offset >= 0) {
126                                                    functionArg = dataStructuresMap.get(DataStructures.valueOf("LEFTCONTEXT"))+"["+offset+"]";
127                                            } else {
128                                                    functionArg = dataStructuresMap.get(DataStructures.valueOf("RIGHTCONTEXT"))+"["+Math.abs(offset + 1)+"]";
129                                            }
130                                    } else if (dataStructuresMap.containsKey(DataStructures.valueOf(items[1].trim()))) {
131                                            if (covington == true) {
132                                                    if (dataStructuresMap.get(DataStructures.valueOf(items[1].trim())).equalsIgnoreCase("Stack")) {
133                                                            functionArg = "Left["+offset+"]";
134                                                    } else {
135                                                            functionArg = "Right["+offset+"]";
136                                                    }
137                                            } else {
138                                                    functionArg = dataStructuresMap.get(DataStructures.valueOf(items[1].trim()))+"["+offset+"]";
139                                            }
140                                    } else if (dataStructuresMap.containsValue(items[1].trim())) {
141                                            if (covington == true) {
142                                                    if (items[1].trim().equalsIgnoreCase("Stack")) {
143                                                            functionArg = "Left["+offset+"]";
144                                                    } else {
145                                                            functionArg = "Right["+offset+"]";
146                                                    }
147                                            } else {
148                                                    functionArg = items[1].trim()+"["+offset+"]";
149                                            }
150                                            
151                                    } else {
152                                            throw new FeatureException("Column two in the feature specification file '"+specModelURL.toString()+"' should not contain the value '"+items[1].trim());
153                                    }
154            
155                                    int linearOffset = 0;
156                                    int headOffset = 0;
157                                    int depOffset = 0;
158                                    int sibOffset = 0;
159                                    int suffixLength = 0;
160                                    if (items.length >= 4) { linearOffset = new Integer(Integer.parseInt(items[3])); }
161                                    if (items.length >= 5) { headOffset = new Integer(Integer.parseInt(items[4])); }
162                                    if (items.length >= 6) { depOffset = new Integer(Integer.parseInt(items[5])); }
163                                    if (items.length >= 7) { sibOffset = new Integer(Integer.parseInt(items[6])); }
164                                    if (items.length >= 8) { suffixLength = new Integer(Integer.parseInt(items[7])); }
165                                    if (linearOffset < 0) {
166                                            linearOffset = Math.abs(linearOffset);
167                                            for (int i = 0; i < linearOffset; i++) {
168                                                    functionArg = "pred("+functionArg+")"; 
169                                            }
170                                    } else if (linearOffset > 0) {
171                                            for (int i = 0; i < linearOffset; i++) {
172                                                    functionArg = "succ("+functionArg+")"; 
173                                            }
174                                    } 
175                                    if (headOffset >= 0) {
176                                            for (int i = 0; i < headOffset; i++) {
177                                                    functionArg = "head("+functionArg+")"; 
178                                            }
179                                    } else {
180                                            throw new FeatureException("The feature specification file '"+specModelURL.toString()+"' should not contain a negative head function value. ");
181                                    }
182                                    if (depOffset < 0) {
183                                            depOffset = Math.abs(depOffset);
184                                            for (int i = 0; i < depOffset; i++) {
185                                                    functionArg = "ldep("+functionArg+")"; 
186                                            }
187                                    } else if (depOffset > 0) {
188                                            for (int i = 0; i < depOffset; i++) {
189                                                    functionArg = "rdep("+functionArg+")";
190                                            }                                                       
191                                    }
192                                    if (sibOffset < 0) {
193                                            sibOffset = Math.abs(sibOffset);
194                                            for (int i = 0; i < sibOffset; i++) {
195                                                    functionArg = "lsib("+functionArg+")"; 
196                                            }
197                                    } else if (sibOffset > 0) {
198                                            for (int i = 0; i < sibOffset; i++) {
199                                                    functionArg = "rsib("+functionArg+")"; 
200                                            }                                                       
201                                    }
202                                    
203                                    if (deprel == true && (pppath == true || pplifted == true || ppcoveredRoot == true)) {
204                                            featureSpecModels.add(specModelIndex, mergePseudoProjColumns(functionArg));
205                                    } else {
206                                            if (suffixLength != 0) {
207                                                    featureSpecModels.add(specModelIndex, "Suffix("+featureText.toString()+functionArg+"),"+suffixLength+")");
208                                            } else if (splitfeats.equals("Split(")) {
209                                                    featureSpecModels.add(specModelIndex, splitfeats+featureText.toString()+functionArg+"),\\|)");
210                                            } else {
211                                                    featureSpecModels.add(specModelIndex, featureText.toString()+functionArg+")");
212                                            }
213                                    }
214    
215                            }
216                    }
217            }
218    
219            private String mergePseudoProjColumns(String functionArg) {
220                    StringBuilder newFeatureText = new StringBuilder();
221                    int c = 1; 
222                    
223                    if (pplifted == true) { c++; };
224                    if (pppath == true) { c++; };
225                    if (ppcoveredRoot == true) { c++; };
226                    
227                    if (c == 1) { // no merge
228                            newFeatureText.append("OutputColumn(DEPREL, ");
229                            newFeatureText.append(functionArg);
230                            newFeatureText.append(')');
231                            return newFeatureText.toString();
232                    }
233                    if (c == 2) {
234                            newFeatureText.append("Merge(");
235                            newFeatureText.append("OutputColumn(DEPREL, ");
236                            newFeatureText.append(functionArg);
237                            newFeatureText.append("), ");
238                            if (pplifted == true) {
239                                    newFeatureText.append("OutputTable(PPLIFTED, ");
240                                    newFeatureText.append(functionArg);
241                                    newFeatureText.append(")");
242                            }
243                            if (pppath == true) {
244                                    newFeatureText.append("OutputTable(PPPATH, ");
245                                    newFeatureText.append(functionArg);
246                                    newFeatureText.append(")");
247                            }
248                            if (ppcoveredRoot == true) {
249                                    newFeatureText.append("OutputTable(PPCOVERED, ");
250                                    newFeatureText.append(functionArg);
251                                    newFeatureText.append(")");
252                            }
253                            newFeatureText.append(")");
254                    } else if (c == 3) { // use Merge3 
255                            int i = 0;
256                            newFeatureText.append("Merge3(");
257                            newFeatureText.append("OutputColumn(DEPREL, ");
258                            newFeatureText.append(functionArg);
259                            newFeatureText.append("), ");
260                            i++;
261                            if (pplifted == true) {
262                                    newFeatureText.append("OutputTable(PPLIFTED, ");
263                                    newFeatureText.append(functionArg);
264                                    i++;
265                                    if (i<3) { 
266                                            newFeatureText.append("), ");
267                                    } else {
268                                            newFeatureText.append(")");
269                                    }
270                            }
271                            if (pppath == true) {
272                                    newFeatureText.append("OutputTable(PPPATH, ");
273                                    newFeatureText.append(functionArg);
274                                    i++;
275                                    if (i<3) { 
276                                            newFeatureText.append("), ");
277                                    } else {
278                                            newFeatureText.append(")");
279                                    }
280                            }
281                            if (ppcoveredRoot == true) {
282                                    newFeatureText.append("OutputTable(PPCOVERED, ");
283                                    newFeatureText.append(functionArg);
284                                    i++;
285                                    if (i<3) { 
286                                            newFeatureText.append("), ");
287                                    } else {
288                                            newFeatureText.append(")");
289                                    }
290                            }
291                            newFeatureText.append(")");
292                    } else { // c == 4
293                            newFeatureText.append("Merge(Merge(");
294                            newFeatureText.append("OutputColumn(DEPREL, ");
295                            newFeatureText.append(functionArg);
296                            newFeatureText.append("), ");
297                            newFeatureText.append("OutputTable(PPLIFTED, ");
298                            newFeatureText.append(functionArg);
299                            newFeatureText.append(")), Merge(");
300                            newFeatureText.append("OutputTable(PPPATH, ");
301                            newFeatureText.append(functionArg);
302                            newFeatureText.append("), ");
303                            newFeatureText.append("OutputTable(PPCOVERED, ");
304                            newFeatureText.append(functionArg);
305                            newFeatureText.append(")))");
306                    }
307                    return newFeatureText.toString();
308            }
309            
310            public EnumMap<ColumnNames, String> getColumnNameMap() {
311                    return columnNameMap;
312            }
313    
314            public void initializeColumnNameMap() {
315                    columnNameMap = new EnumMap<ColumnNames, String>(ColumnNames.class);
316                    columnNameMap.put(ColumnNames.POS, "POSTAG");
317                    columnNameMap.put(ColumnNames.CPOS, "CPOSTAG");
318                    columnNameMap.put(ColumnNames.DEP, "DEPREL");
319                    columnNameMap.put(ColumnNames.LEX, "FORM");
320                    columnNameMap.put(ColumnNames.LEMMA, "LEMMA");
321                    columnNameMap.put(ColumnNames.FEATS, "FEATS");
322            }
323    
324            public void setColumnNameMap(EnumMap<ColumnNames, String> columnNameMap) {
325                    this.columnNameMap = columnNameMap;
326            }
327            
328            public EnumMap<DataStructures, String> getDataStructuresMap() {
329                    return dataStructuresMap;
330            }
331    
332            //TODO Fix covington
333            public void initializeDataStructuresMap() {
334                    dataStructuresMap = new EnumMap<DataStructures, String>(DataStructures.class);
335                    dataStructuresMap.put(DataStructures.STACK, "Stack");
336                    dataStructuresMap.put(DataStructures.INPUT, "Input");
337            }
338    
339            public void setDataStructuresMap(EnumMap<DataStructures, String> dataStructuresMap) {
340                    this.dataStructuresMap = dataStructuresMap;
341            }
342            
343            public boolean isUseSplitFeats() {
344                    return useSplitFeats;
345            }
346    
347            public void setUseSplitFeats(boolean useSplitFeats) {
348                    this.useSplitFeats = useSplitFeats;
349            }
350    
351            public boolean isCovington() {
352                    return covington;
353            }
354    
355            public void setCovington(boolean covington) {
356                    this.covington = covington;
357            }
358    
359            public boolean isPppath() {
360                    return pppath;
361            }
362    
363            public void setPppath(boolean pppath) {
364                    this.pppath = pppath;
365            }
366    
367            public boolean isPplifted() {
368                    return pplifted;
369            }
370    
371            public void setPplifted(boolean pplifted) {
372                    this.pplifted = pplifted;
373            }
374    
375            public boolean isPpcoveredRoot() {
376                    return ppcoveredRoot;
377            }
378    
379            public void setPpcoveredRoot(boolean ppcoveredRoot) {
380                    this.ppcoveredRoot = ppcoveredRoot;
381            }
382    
383            public String toString() {
384                    StringBuilder sb = new StringBuilder();
385                    sb.append("Mapping of column names:\n");
386                    for (ColumnNames columnName : ColumnNames.values()) {
387                            sb.append(columnName.toString()+"\t"+columnNameMap.get(columnName)+"\n");
388                    }
389                    sb.append("Mapping of data structures:\n");
390                    for (DataStructures dataStruct : DataStructures.values()) {
391                            sb.append(dataStruct.toString()+"\t"+dataStructuresMap.get(dataStruct)+"\n");
392                    }
393                    sb.append("Split FEATS column: "+useSplitFeats+"\n");
394                    return sb.toString();
395            }
396    }