001    package org.maltparser.core.syntaxgraph.writer;
002    
003    import java.io.BufferedWriter;
004    import java.io.FileNotFoundException;
005    import java.io.FileOutputStream;
006    import java.io.IOException;
007    import java.io.OutputStream;
008    import java.io.OutputStreamWriter;
009    import java.io.UnsupportedEncodingException;
010    import java.util.SortedMap;
011    import java.util.TreeMap;
012    import java.util.regex.PatternSyntaxException;
013    
014    import org.maltparser.core.exception.MaltChainedException;
015    
016    import org.maltparser.core.helper.Util;
017    import org.maltparser.core.io.dataformat.ColumnDescription;
018    import org.maltparser.core.io.dataformat.DataFormatException;
019    import org.maltparser.core.io.dataformat.DataFormatInstance;
020    import org.maltparser.core.syntaxgraph.PhraseStructure;
021    import org.maltparser.core.syntaxgraph.TokenStructure;
022    import org.maltparser.core.syntaxgraph.node.NonTerminalNode;
023    import org.maltparser.core.syntaxgraph.node.PhraseStructureNode;
024    import org.maltparser.core.syntaxgraph.node.TokenNode;
025    import org.maltparser.core.syntaxgraph.reader.TigerXMLHeader;
026    import org.maltparser.ml.libsvm.LibsvmException;
027    /**
028    *
029    *
030    * @author Johan Hall
031    */
032    public class TigerXMLWriter implements SyntaxGraphWriter {
033            private enum RootHandling {
034                    TALBANKEN, NORMAL  
035            };
036    
037            private BufferedWriter writer;
038            private DataFormatInstance dataFormatInstance;
039            private String optionString;
040            private int sentenceCount;
041            private TigerXMLHeader header;
042    //      private boolean hasWriteTigerXMLHeader = false;
043            private RootHandling rootHandling;
044            private String sentencePrefix = "s";
045            private StringBuilder sentenceID;
046            private StringBuilder tmpID;
047            private StringBuilder rootID;
048            private int START_ID_OF_NONTERMINALS = 500;
049            private boolean labeledTerminalID;
050            private String VROOT_SYMBOL = "VROOT";
051            private boolean useVROOT = false;
052    //      private String fileName = null;
053    //      private String charsetName = null;
054            private boolean closeStream = true;
055            
056            public TigerXMLWriter() { 
057                    sentenceID = new StringBuilder();
058                    tmpID = new StringBuilder();
059                    rootID = new StringBuilder();
060                    labeledTerminalID = false;
061            }
062            
063            public void open(String fileName, String charsetName) throws MaltChainedException {
064                    try {
065    //                      this.fileName = fileName;
066    //                      this.charsetName = charsetName;
067                            open(new OutputStreamWriter(new FileOutputStream(fileName),charsetName));
068                    } catch (FileNotFoundException e) {
069                            throw new DataFormatException("The output file '"+fileName+"' cannot be found.", e);
070                    } catch (UnsupportedEncodingException e) {
071                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
072                    }       
073            }
074            
075            public void open(OutputStream os, String charsetName) throws MaltChainedException {
076                    try {
077                            if (os == System.out || os == System.err) {
078                                    closeStream = false;
079                            }
080                            open(new OutputStreamWriter(os, charsetName));
081                    } catch (UnsupportedEncodingException e) {
082                            throw new DataFormatException("The character encoding set '"+charsetName+"' isn't supported.", e);
083                    }
084            }
085            
086            private void open(OutputStreamWriter osw) throws MaltChainedException {
087                    setWriter(new BufferedWriter(osw));
088                    setSentenceCount(0);
089            }
090            
091            public void writeProlog() throws MaltChainedException { 
092    //              if (fileName == null || charsetName == null) {
093                            writeHeader();
094    //              }
095            }
096            
097            public void writeSentence(TokenStructure syntaxGraph) throws MaltChainedException {
098                    if (syntaxGraph == null || dataFormatInstance == null) {
099                            return;
100                    }
101                    if (syntaxGraph.hasTokens()) {
102                            sentenceCount++;
103                            final PhraseStructure phraseStructure = (PhraseStructure)syntaxGraph;
104                            try {
105                                    sentenceID.setLength(0);
106                                    sentenceID.append(sentencePrefix);
107                                    if (phraseStructure.getSentenceID() != 0) {
108                                            sentenceID.append(Integer.toString(phraseStructure.getSentenceID()));
109                                    } else {
110                                            sentenceID.append(Integer.toString(sentenceCount));
111                                    }
112                                    writer.write("    <s id=\"");
113                                    writer.write(sentenceID.toString());  
114                                    writer.write("\">\n");
115                                    
116                                    setRootID(phraseStructure);
117                                    writer.write("      <graph root=\"");
118                                    writer.write(rootID.toString());
119                                    writer.write("\" ");
120                                    writer.write("discontinuous=\"");
121                                    writer.write(Boolean.toString(!phraseStructure.isContinuous()));
122                                    writer.write("\">\n");
123                                    
124                                    writeTerminals(phraseStructure);
125                                    if (phraseStructure.nTokenNode() != 1 || rootHandling.equals(RootHandling.TALBANKEN)) {
126                                            writeNonTerminals(phraseStructure);
127                                    } else {
128                                            writer.write("        <nonterminals/>\n");
129                                    }
130                                    writer.write("      </graph>\n");
131                                    writer.write("    </s>\n");
132                            } catch (IOException e) {
133                                    throw new DataFormatException("The TigerXML writer could not write to file. ", e);
134                            }
135                    }
136            }
137            
138            private void setRootID(PhraseStructure phraseStructure) throws MaltChainedException {
139                    useVROOT = false;
140                    PhraseStructureNode root = phraseStructure.getPhraseStructureRoot();
141                    for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
142                            if (root.hasLabel(column.getSymbolTable()) && root.getLabelSymbol(column.getSymbolTable()).equals(VROOT_SYMBOL)) {
143                                    useVROOT = true;
144                                    break;
145                            }
146                    }
147                    if (useVROOT) {
148                            rootID.setLength(0);
149                            rootID.append(sentenceID);
150                            rootID.append('_');
151                            rootID.append(VROOT_SYMBOL);
152                    } else if (phraseStructure.nTokenNode() == 1 && phraseStructure.nNonTerminals() == 0 && !root.isLabeled()) {
153                            rootID.setLength(0);
154                            rootID.append(sentenceID);
155                            rootID.append("_1");
156                    } else {
157                            rootID.setLength(0);
158                            rootID.append(sentenceID);
159                            rootID.append('_');
160    //                      if (rootHandling.equals(RootHandling.NORMAL)) { 
161                                    rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+phraseStructure.nNonTerminals()));
162    //                      } else if (rootHandling.equals(RootHandling.TALBANKEN)) {
163    //                              rootID.append(Integer.toString(START_ID_OF_NONTERMINALS+1));
164    //                      }
165                    }
166    
167            }
168            
169            public void writeEpilog() throws MaltChainedException { 
170                    writeTail();
171            }
172            
173            public BufferedWriter getWriter() {
174                    return writer;
175            }
176    
177            public void setWriter(BufferedWriter writer) {
178                    this.writer = writer;
179            }
180            
181            public void close() throws MaltChainedException {
182                    try {
183                            if (writer != null) {
184                                    writer.flush();
185                                    if (closeStream) {
186                                            writer.close();
187                                    }
188                                    writer = null;
189                            }
190                    }   catch (IOException e) {
191                            throw new DataFormatException("Could not close the output file. ", e);
192                    } 
193            }
194            
195            private void writeHeader() throws MaltChainedException {
196                    try {
197                            if (header == null) {
198                                    header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
199                            }
200                            writer.write(header.toTigerXML());
201    //                      hasWriteTigerXMLHeader = true;
202                    } catch (IOException e) {
203                            throw new DataFormatException("The TigerXML writer could not write to file. ", e);
204                    }
205            }
206            
207            
208            private void writeTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
209                    try {
210                            writer.write("        <terminals>\n");
211                            for (int index : phraseStructure.getTokenIndices()) {
212                                    final PhraseStructureNode t = phraseStructure.getTokenNode(index);
213                                    writer.write("          <t ");
214                                    if (!labeledTerminalID) {
215                                            tmpID.setLength(0);
216                                            tmpID.append(sentenceID);
217                                            tmpID.append('_');
218                                            tmpID.append(Integer.toString(t.getIndex()));
219                                            writer.write("id=\"");writer.write(tmpID.toString());writer.write("\" ");
220                                    }
221                                    
222                                    for (ColumnDescription column : dataFormatInstance.getInputColumnDescriptionSet()) {
223                                            writer.write(column.getName().toLowerCase());
224                                            writer.write("=\"");
225                                            writer.write(Util.xmlEscape(t.getLabelSymbol(column.getSymbolTable())));
226                                            writer.write("\" ");    
227                                    }
228                                    writer.write("/>\n");
229                            }
230                            writer.write("        </terminals>\n");
231                    } catch (IOException e) {
232                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
233                    }
234            }
235            
236            public void writeNonTerminals(PhraseStructure phraseStructure) throws MaltChainedException {
237                    try {
238                            SortedMap<Integer,Integer> heights = new TreeMap<Integer,Integer>();
239                            for (int index : phraseStructure.getNonTerminalIndices()) {
240                                    heights.put(index, ((NonTerminalNode)phraseStructure.getNonTerminalNode(index)).getHeight());
241                            }
242                            writer.write("        <nonterminals>\n");
243                            boolean done = false;
244                            int h = 1;
245                            while (!done) {
246                                    done = true;
247                                    for (int index : phraseStructure.getNonTerminalIndices()) {
248                                            if (heights.get(index) == h) {
249                                                    NonTerminalNode nt = (NonTerminalNode)phraseStructure.getNonTerminalNode(index);
250                                                    tmpID.setLength(0);
251                                                    tmpID.append(sentenceID);
252                                                    tmpID.append('_');
253                                                    tmpID.append(Integer.toString(nt.getIndex()+START_ID_OF_NONTERMINALS-1));
254                                                    writeNonTerminal(nt, tmpID.toString());
255                                                    done = false;
256                                            }
257                                    }
258                                    h++;
259                            }
260                            
261                            writeNonTerminal((NonTerminalNode)phraseStructure.getPhraseStructureRoot(),rootID.toString());
262                            writer.write("        </nonterminals>\n");
263                    } catch (IOException e) {
264                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
265                    }
266            }
267            
268            public void writeNonTerminal(NonTerminalNode nt, String id) throws MaltChainedException {
269                    try {
270                            writer.write("          <nt");
271                            writer.write(" id=\"");writer.write(id);writer.write("\" ");
272                            for (ColumnDescription column : dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptionSet()) {
273                                    if (nt.hasLabel(column.getSymbolTable())) {
274                                            writer.write(column.getName().toLowerCase());
275                                            writer.write("=");
276                                            writer.write("\"");
277                                            writer.write(Util.xmlEscape(nt.getLabelSymbol(column.getSymbolTable())));
278                                            writer.write("\" ");
279                                    }
280                            }
281                            writer.write(">\n");
282                            
283                            for (int i = 0, n = nt.nChildren(); i < n; i++) {
284                                    PhraseStructureNode child = nt.getChild(i); 
285                                    writer.write("            <edge ");
286    
287                                    for (ColumnDescription column : dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptionSet()) {
288                                            if (child.hasParentEdgeLabel(column.getSymbolTable())) {
289                                                    writer.write(column.getName().toLowerCase());
290                                                    writer.write("=\"");
291                                                    writer.write(Util.xmlEscape(child.getParentEdgeLabelSymbol(column.getSymbolTable())));
292                                                    writer.write("\" ");
293                                            }
294                                    }
295                                    if (child instanceof TokenNode) {
296                                            if (!labeledTerminalID) {
297                                                    tmpID.setLength(0);
298                                                    tmpID.append(sentenceID);
299                                                    tmpID.append('_');
300                                                    tmpID.append(Integer.toString(child.getIndex()));
301                                                    writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
302                                            } else {
303                                                    writer.write(" idref=\"");writer.write(child.getLabelSymbol(dataFormatInstance.getInputSymbolTables().get("ID")));writer.write("\"");
304                                            }
305                                            
306                                    } else {
307                                            tmpID.setLength(0);
308                                            tmpID.append(sentenceID);
309                                            tmpID.append('_');
310                                            tmpID.append(Integer.toString(child.getIndex()+START_ID_OF_NONTERMINALS-1));
311                                            writer.write(" idref=\"");writer.write(tmpID.toString());writer.write("\"");
312                                    }
313                                    writer.write(" />\n");
314                            }
315                            writer.write("          </nt>\n");
316                    } catch (IOException e) {
317                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
318                    }
319            }
320    
321            
322            private void writeTail() throws MaltChainedException {
323                    try {
324                            writer.write("  </body>\n");
325                            writer.write("</corpus>\n");
326                            writer.flush();
327    //                      if (fileName != null && charsetName != null) {
328    //                              writer.close();
329    //                              writer = null;
330    //                              BufferedWriter headerWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName+".header"),charsetName));
331    //                              if (header == null) {
332    //                                      header = new TigerXMLHeader(dataFormatInstance.getSymbolTables());
333    //                              }
334    //                              
335    //                              headerWriter.write(header.toTigerXML());
336    //                              headerWriter.flush();
337    //                              headerWriter.close();
338    //                      }
339                    } catch (IOException e) {
340                            throw new DataFormatException("The TigerXML writer is not able to write. ", e);
341                    }
342            }
343            
344            public int getSentenceCount() {
345                    return sentenceCount;
346            }
347    
348            public void setSentenceCount(int sentenceCount) {
349                    this.sentenceCount = sentenceCount;
350            }
351            
352            public DataFormatInstance getDataFormatInstance() {
353                    return dataFormatInstance;
354            }
355    
356            public void setDataFormatInstance(DataFormatInstance dataFormatInstance) {
357                    this.dataFormatInstance = dataFormatInstance;
358                    labeledTerminalID = (dataFormatInstance.getInputColumnDescriptions().containsKey("id") || dataFormatInstance.getInputColumnDescriptions().containsKey("ID"));
359            }
360    
361            public String getOptions() {
362                    return optionString;
363            }
364            
365            public void setOptions(String optionString) throws MaltChainedException {
366                    this.optionString = optionString;
367                    rootHandling = RootHandling.NORMAL;
368    
369                    String[] argv;
370                    try {
371                            argv = optionString.split("[_\\p{Blank}]");
372                    } catch (PatternSyntaxException e) {
373                            throw new DataFormatException("Could not split the TigerXML writer option '"+optionString+"'. ", e);
374                    }
375                    for (int i=0; i < argv.length-1; i++) {
376                            if(argv[i].charAt(0) != '-') {
377                                    throw new DataFormatException("The argument flag should start with the following character '-', not with "+argv[i].charAt(0));
378                            }
379                            if(++i>=argv.length) {
380                                    throw new DataFormatException("The last argument does not have any value. ");
381                            }
382                            switch(argv[i-1].charAt(1)) {
383                            case 'r': 
384                                    if (argv[i].equals("n")) {
385                                            rootHandling = RootHandling.NORMAL;
386                                    } else if (argv[i].equals("tal")) {
387                                            rootHandling = RootHandling.TALBANKEN;
388                                    }
389                                    break;
390                            case 's': 
391                                    try {
392                                            START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]);
393                                    } catch (NumberFormatException e){
394                                            throw new MaltChainedException("The TigerXML writer option -s must be an integer value. ");
395                                    }
396                                    break;
397                            case 'v': 
398                                    VROOT_SYMBOL = argv[i];
399                                    break;  
400                            default:
401                                    throw new LibsvmException("Unknown TigerXML writer option: '"+argv[i-1]+"' with value '"+argv[i]+"'. ");                
402                            }
403                    }       
404            }
405    }