001    package org.maltparser.core.syntaxgraph.writer;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileNotFoundException;
005    import java.io.FileOutputStream;
006    import java.io.IOException;
007    import java.io.OutputStream;
008    import java.io.OutputStreamWriter;
009    import java.io.UnsupportedEncodingException;
010    import java.util.Iterator;
011    import java.util.LinkedHashMap;
012    import java.util.SortedMap;
013    import java.util.TreeMap;
014    import java.util.regex.PatternSyntaxException;
015    
016    import org.maltparser.core.exception.MaltChainedException;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.PhraseStructure;
021    import org.maltparser.core.syntaxgraph.TokenStructure;
022    import org.maltparser.core.syntaxgraph.edge.Edge;
023    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
024    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
025    import org.maltparser.ml.libsvm.LibsvmException;
026    /**
027    *
028    *
029    * @author Johan Hall
030    */
031    public class NegraWriter implements SyntaxGraphWriter {
032            private BufferedWriter writer; 
033            private DataFormatInstance dataFormatInstance;
034            private String optionString;
035            private int sentenceCount;
036            private LinkedHashMap<Integer, Integer> nonTerminalIndexMap;
037            private int START_ID_OF_NONTERMINALS = 500;
038            private boolean closeStream = true;
039            
040            public NegraWriter() { 
041                    nonTerminalIndexMap = new LinkedHashMap<Integer, Integer>();
042            }
043            
044            public void open(String fileName, String charsetName) throws MaltChainedException {
045                    try {
046                            open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
047                    } catch (FileNotFoundException e) {
048                            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
049                    } catch (UnsupportedEncodingException e) {
050                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
051                    }       
052            }
053            
054            public void open(OutputStream os, String charsetName) throws MaltChainedException {
055                    try {
056                            if (os == System.out || os == System.err) {
057                                    closeStream = false;
058                            }
059                            open(new OutputStreamWriter(os, charsetName));
060                    } catch (UnsupportedEncodingException e) {
061                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
062                    }
063            }
064            
065            private void open(OutputStreamWriter osw) throws MaltChainedException {
066                    setWriter(new BufferedWriter(osw));
067                    setSentenceCount(0);
068            }
069            
070            public void writeProlog() throws MaltChainedException { }
071            
072            public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
073                    if (syntaxGraph == null || dataFormatInstance == null || !(syntaxGraph instanceof PhraseStructure) || !syntaxGraph.hasTokens()) {
074                            return;
075                    }
076                    PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
077                    sentenceCount++;
078                    try {
079                            writer.write("#BOS ");
080                            if (phraseStructure.getSentenceID() != 0) {
081                                    writer.write(Integer.toString(phraseStructure.getSentenceID()));
082                            } else {
083                                    writer.write(Integer.toString(sentenceCount));
084                            }
085                            writer.write('\n');
086    
087                            if (phraseStructure.hasNonTerminals()) {
088                                    calculateIndices(phraseStructure);
089                                    writeTerminals(phraseStructure);
090                                    writeNonTerminals(phraseStructure);
091                            } else {
092                                    writeTerminals(phraseStructure);
093                            }
094                            writer.write("#EOS ");
095                            if (phraseStructure.getSentenceID() != 0) {
096                                    writer.write(Integer.toString(phraseStructure.getSentenceID()));
097                            } else {
098                                    writer.write(Integer.toString(sentenceCount));
099                            }
100                            writer.write('\n');
101                    } catch (IOException e) {
102                            throw new DataFormatException("Could not write to the output file. ", e);
103                    }
104            }
105            public void writeEpilog() throws MaltChainedException { }
106            
107    
108            private void calculateIndices(PhraseStructure phraseStructure) throws MaltChainedException {
109                    final SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
110                    for (int index : phraseStructure.getNonTerminalIndices()) {
111                            heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
112                    }
113                    
114                    boolean done = false;
115                    int h = 1;
116                    int ntid = START_ID_OF_NONTERMINALS;
117                    nonTerminalIndexMap.clear();
118                    while (!done) {
119                            done = true;
120                            for (int index : phraseStructure.getNonTerminalIndices()) {
121                                    if (heights.get(index) == h) {
122                                            NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
123                                            nonTerminalIndexMap.put(nt.getIndex(), ntid++);
124    //                                      nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
125                                            done = false;
126                                    }
127                            }
128                            h++;
129                    }
130                    
131    //              boolean done = false;
132    //              int h = 1;
133    ////            int ntid = START_ID_OF_NONTERMINALS;
134    ////            nonTerminalIndexMap.clear();
135    //              while (!done) {
136    //                      done = true;
137    //                      for (int index : phraseStructure.getNonTerminalIndices()) {
138    //                              if (heights.get(index) == h) {
139    //                                      NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
140    ////                                    nonTerminalIndexMap.put(nt.getIndex(), ntid++);
141    //                                      nonTerminalIndexMap.put(nt.getIndex(), nt.getIndex()+START_ID_OF_NONTERMINALS-1);
142    //                                      done = false;
143    //                              }
144    //                      }
145    //                      h++;
146    //              }
147            }
148            
149            private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
150                    try {
151                            for (int index : phraseStructure.getTokenIndices()) {
152                                    final PhraseStructureNode terminal = phraseStructure.getTokenNode(index);
153                                    final Iterator<ColumnDescription> columns = dataFormatInstance.iterator();
154                                    ColumnDescription column = null;
155                                    int ti = 1;
156                                    while (columns.hasNext()) {
157                                            column = columns.next();
158                                            if (column.getCategory() == ColumnDescription.INPUT) {
159                                                    writer.write(terminal.getLabelSymbol(column.getSymbolTable()));
160                                                    int nTabs = 1;
161                                                    if (ti == 1 || ti == 2) {
162                                                            nTabs = 3 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
163                                                    } else if (ti == 3) {
164                                                            nTabs = 1;
165                                                    } else if (ti == 4) {
166                                                            nTabs = 2 - (terminal.getLabelSymbol(column.getSymbolTable()).length() / 8);
167                                                    }
168                                                    if (nTabs < 1) {
169                                                            nTabs = 1;
170                                                    }
171                                                    for (int j = 0; j < nTabs; j++) {
172                                                            writer.write('\t');
173                                                    }
174                                                    ti++;
175                                            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) {
176                                                    if (terminal.getParent() != null && terminal.hasParentEdgeLabel(column.getSymbolTable())) {
177                                                            writer.write(terminal.getParentEdgeLabelSymbol(column.getSymbolTable()));
178                                                            writer.write('\t');
179                                                    } else {
180                                                            writer.write("--\t");
181                                                    }
182                                            } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL) { 
183                                                    if (terminal.getParent() == null || terminal.getParent() == phraseStructure.getPhraseStructureRoot()) {
184                                                            writer.write('0');
185                                                    } else {
186                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(terminal.getParent().getIndex())));
187    //                                                      writer.write(Integer.toString(terminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
188                                                    }
189                                            }
190                                    }
191                                    for (Edge e : terminal.getIncomingSecondaryEdges()) {
192                                            if (e.hasLabel(column.getSymbolTable())) {
193                                                    writer.write('\t');
194                                                    writer.write(e.getLabelSymbol(column.getSymbolTable()));
195                                                    writer.write('\t');
196                                                    if (e.getSource() instanceof NonTerminalNode) {
197                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
198    //                                                      writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
199                                                    } else {
200                                                            writer.write(Integer.toString(e.getSource().getIndex()));
201                                                    }
202                                            }
203                                    }
204                                    writer.write("\n");
205                            }
206    
207                    } catch (IOException e) {
208                            throw new DataFormatException("The Negra writer is not able to write. ", e);
209                    }
210            }
211            
212            private void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
213                    for (int index : nonTerminalIndexMap.keySet()) {
214    //              for (int index : phraseStructure.getNonTerminalIndices()) {
215                            NonTerminalNode nonTerminal = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
216            
217                            if (nonTerminal == null || nonTerminal.isRoot()) {
218                                    return;
219                            }
220                            try {
221                                    writer.write('#');
222    //                              writer.write(Integer.toString(index+START_ID_OF_NONTERMINALS-1));
223                                    writer.write(Integer.toString(nonTerminalIndexMap.get(index)));
224                                    writer.write("\t\t\t--\t\t\t");
225                                    if (nonTerminal.hasLabel(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable())) {
226                                            writer.write(nonTerminal.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("CAT").getSymbolTable()));
227                                    } else {
228                                            writer.write("--");
229                                    }
230                                    writer.write("\t--\t\t");
231                                    if (nonTerminal.hasParentEdgeLabel(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable())) {
232                                            writer.write(nonTerminal.getParentEdgeLabelSymbol(dataFormatInstance.getColumnDescriptionByName("LABEL").getSymbolTable()));
233                                    } else {
234                                            writer.write("--");
235                                    }
236                                    writer.write('\t');
237                                    if (nonTerminal.getParent() == null || nonTerminal.getParent().isRoot()) {
238                                            writer.write('0');
239                                    } else {
240    //                                      writer.write(Integer.toString(nonTerminal.getParent().getIndex()+START_ID_OF_NONTERMINALS-1));
241                                            writer.write(Integer.toString(nonTerminalIndexMap.get(nonTerminal.getParent().getIndex())));
242                                    }
243                                    for (Edge e : nonTerminal.getIncomingSecondaryEdges()) {
244                                            if (e.hasLabel(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable())) {
245                                                    writer.write('\t');
246                                                    writer.write(e.getLabelSymbol(dataFormatInstance.getColumnDescriptionByName("SECEDGELABEL").getSymbolTable()));
247                                                    writer.write('\t');
248                                                    if (e.getSource() instanceof NonTerminalNode) {
249    //                                                      writer.write(Integer.toString(e.getSource().getIndex()+START_ID_OF_NONTERMINALS-1));
250                                                            writer.write(Integer.toString(nonTerminalIndexMap.get(e.getSource().getIndex())));
251                                                    } else {
252                                                            writer.write(Integer.toString(e.getSource().getIndex()));
253                                                    }
254                                            }
255                                    }
256                                    writer.write("\n");
257                            } catch (IOException e) {
258                                    throw new DataFormatException("The Negra writer is not able to write the non-terminals. ", e);
259                            }
260                    }
261            }
262            
263            public BufferedWriter getWriter() {
264                    return writer;
265            }
266    
267            public void setWriter(BufferedWriter writer) {
268                    this.writer = writer;
269            }
270            
271            public int getSentenceCount() {
272                    return sentenceCount;
273            }
274    
275            public void setSentenceCount(int sentenceCount) {
276                    this.sentenceCount = sentenceCount;
277            }
278            
279            public DataFormatInstance getDataFormatInstance() {
280                    return dataFormatInstance;
281            }
282    
283            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
284                    this.dataFormatInstance = dataFormatInstance;
285            }
286    
287            public String getOptions() {
288                    return optionString;
289            }
290            
291            public void setOptions(String optionString) throws MaltChainedException {
292                    this.optionString = optionString;
293                    String[] argv;
294                    try {
295                            argv = optionString.split("[_\\p{Blank}]");
296                    } catch (PatternSyntaxException e) {
297                            throw new DataFormatException("Could not split the penn writer option '"+optionString+"'. ", e);
298                    }
299                    for (int i=0; i < argv.length-1; i++) {
300                            if(argv[i].charAt(0) != '-') {
301                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
302                            }
303                            if(++i>=argv.length) {
304                                    throw new DataFormatException("The last argument does not have any value. ");
305                            }
306                            switch(argv[i-1].charAt(1)) {
307                            case 's': 
308                                    try {
309                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
310                                    } catch (NumberFormatException e){
311                                            throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. ");
312                                    }
313                                    break;
314                            default:
315                                    throw new LibsvmException("Unknown svm parameter: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");         
316                            }
317                    }       
318            }
319            
320            public void close() throws MaltChainedException {
321                    try {
322                            if (writer != null) {
323                                    writer.flush();
324                                    if (closeStream) {
325                                            writer.close();
326                                    }
327                                    writer = null;
328                            }
329                    }   catch (IOException e) {
330                            throw new DataFormatException("Could not close the output file. ", e);
331                    } 
332            }
333    }