[Libreoffice-commits] .: scratch/formatxml.cpp
Lubos Lunak
llunak at kemper.freedesktop.org
Tue Dec 21 11:19:06 PST 2010
scratch/formatxml.cpp | 271 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 271 insertions(+)
New commits:
commit de52799eed94584c81fac80e3c12271cbca7d10d
Author: Luboš Luňák <l.lunak at suse.cz>
Date: Tue Dec 21 20:18:21 2010 +0100
a tool that just reformats xml files to be readable
diff --git a/scratch/formatxml.cpp b/scratch/formatxml.cpp
new file mode 100644
index 0000000..d0c5f51
--- /dev/null
+++ b/scratch/formatxml.cpp
@@ -0,0 +1,271 @@
+/*****************************************************************
+
+Copyright (C) 2010 Lubos Lunak <l.lunak at suse.cz>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+******************************************************************/
+
+/*
+
+This is a tool that nicely formats an XML file (e.g. the .docx or .odt formats
+put basically everything on a single line, which is a pain to analyze).
+Unlike 'xmllint --format', this works even if the XML is corrupt, and unlike
+xmllint's --recover it does not alter the XML itself in any way (or at least
+tries not to, complain if there's a problem).
+
+To compile (libQtCore from Qt4 is required, $QTDIR is the location, usually /usr):
+g++ -Wall -I$QTDIR/include/QtCore -I$QTDIR/include formatxml.cpp -lQtCore -L$QTDIR/lib -o formatxml
+
+The formatted output is written to stdout if stdout is redirected; otherwise it
+is written to a file named like the input file with .format.xml appended.
+
+*/
+
+#include <assert.h>
+#include <qfile.h>
+#include <qstack.h>
+#include <qstringlist.h>
+#include <qtextstream.h>
+#include <stdio.h>
+#include <unistd.h>
+
+// Classification of one token from readTokens(), as computed by analyzeToken().
+// The enumerator order matches the (disabled) debug name table in format(),
+// so keep both in sync when changing it.
+enum TokenType
+ {
+ Error, // parse error or whatever (empty token, or '<' without a recognizable tag shape)
+ OtherTag, // tokens starting with <? or <! that analyzeToken() accepts (meant for comments, <? ... ?>)
+ OpeningTag, // <foo>
+ ClosingTag, // </foo>
+ StandaloneTag, // <foo/>
+ Text // whatever text outside of tags
+ };
+
+// Splits the whole input into a flat list of tokens, without changing any content
+// except for whitespace that follows a newline between tokens.
+//
+// A token is either
+//  - a tag: everything from '<' up to and including the next '>' (or up to the end
+//    of the input if the '>' is missing), or
+//  - text: everything up to, but not including, the next '<' or '>'.
+//
+// Returns the tokens in input order.
+static QStringList readTokens( QTextStream& in )
+    {
+    QStringList tokens;
+    while( !in.atEnd())
+        {
+        QChar c;
+        in >> c;
+        if( c == '\n' ) // strip line leading whitespace (otherwise keep it, may be empty text between tags)
+            {
+            in.skipWhiteSpace();
+            // Check for the end of input *before* reading the next character: checking
+            // afterwards would also discard a real character that happens to be the
+            // very last one in the input.
+            if( in.atEnd())
+                break;
+            in >> c;
+            }
+        QString token( c );
+        if( c == '<' )
+            {
+            // Tag: collect everything up to and including the closing '>'.
+            while( !in.atEnd())
+                {
+                in >> c;
+                token.append( c );
+                if( c == '>' )
+                    break;
+                }
+            }
+        else
+            {
+            // Text: collect everything up to the next '<' or '>'. Newlines inside
+            // the text are deliberately kept as part of the token.
+            while( !in.atEnd())
+                {
+                in >> c;
+                if( c == '<' || c == '>' )
+                    {
+                    // One char back, so that the outer loop sees the delimiter again.
+                    // This relies on '<' and '>' taking a single byte in the stream's
+                    // encoding, which holds for the UTF-8 codec selected in main().
+                    in.seek( in.pos() - 1 );
+                    break;
+                    }
+                token.append( c );
+                }
+            }
+        tokens.append( token );
+        }
+    return tokens;
+    }
+
+// Returns the element name of an opening or closing tag token, e.g. "foo" for
+// "<foo>", "</foo>", "<foo/>" and "<foo bar='x'>".
+//
+// The token must be at least 3 characters long and start with '<' (asserted).
+// The name starts after "<" or "</" and ends at the first whitespace character,
+// or, if there is none, right before the trailing ">" or "/>".
+static QString tagName( const QString& token )
+    {
+    assert( token.length() >= 3 && token[ 0 ] == '<' );
+    const int start = ( token[ 1 ] == '/' ? 2 : 1 );
+    int after = token.length() - 1; // strip trailing >
+    if( token[ token.length() - 2 ] == '/' )
+        after = token.length() - 2; // strip trailing />
+    // XML allows any whitespace (space, tab, newline) between the name and the
+    // attributes, so do not look only for a plain space.
+    for( int i = start; i < after; ++i )
+        {
+        if( token[ i ].isSpace())
+            {
+            after = i;
+            break;
+            }
+        }
+    return token.mid( start, after - start );
+    }
+
+// Classifies a token produced by readTokens().
+//
+// Returns
+//  - Error for an empty token and for a token that starts with '<' but does not
+//    have any of the shapes below (e.g. the closing '>' is missing),
+//  - Text for anything that does not start with '<',
+//  - OtherTag for "<? ... ?>" and for "<! ... >" (comments, DOCTYPE, ...),
+//  - StandaloneTag for "<foo/>", ClosingTag for "</foo>", OpeningTag for "<foo>".
+static TokenType analyzeToken( const QString& token )
+    {
+    if( token.isEmpty())
+        return Error;
+    if( token[ 0 ] != '<' )
+        return Text;
+    const int length = token.length();
+    const bool closed = ( token[ length - 1 ] == '>' );
+    if( length >= 4 && token[ 1 ] == '?' ) // <??>
+        {
+        // Processing instructions end with "?>".
+        return ( closed && token[ length - 2 ] == '?' ) ? OtherTag : Error;
+        }
+    if( length >= 4 && token[ 1 ] == '!' ) // <!-- -->, <!DOCTYPE ...>
+        {
+        // These do not end with "!>", so only the closing '>' is required here;
+        // otherwise every comment would be reported as a parse error.
+        return closed ? OtherTag : Error;
+        }
+    if( !closed )
+        return Error;
+    if( length >= 4 && token[ length - 2 ] == '/' ) // <a/>
+        return StandaloneTag;
+    if( length >= 4 && token[ 1 ] == '/' ) // </a>
+        return ClosingTag;
+    if( length >= 3 ) // <a>
+        return OpeningTag;
+    return Error;
+    }
+
+// Builds the indentation prefix: a string made of 'size' space characters.
+static QString indent( int size )
+    {
+    QString padding;
+    padding.fill( ' ', size );
+    return padding;
+    }
+
+// Ends the current output line if one is still pending.
+// 'needNewLine' tells whether the line is unfinished; it is reset to false
+// once the line break has been written.
+static void ensureNewLine( QTextStream& out, bool* needNewLine )
+    {
+    if( !*needNewLine )
+        return; // already at the start of a line
+    *needNewLine = false;
+    out << endl;
+    }
+
+// Reads the whole XML from 'in' and writes it to 'out' with one tag per line,
+// indented by two spaces per nesting level. Text directly following an opening
+// tag and the closing tag following it stay on the same line as the opening tag.
+//
+// Problems are reported as "<!-- ERROR: ... -->" comments in the output; the
+// tokens themselves are always written unchanged.
+//
+// Returns true if no error comment had to be written, false otherwise.
+static bool format( QTextStream& in, QTextStream& out )
+    {
+#define INDENT indent( stack.size() * 2 )
+    QStack< QString > stack; // names of the currently open tags
+    QStringList tokens = readTokens( in );
+    bool needNewLine = false; // true while the current output line is unfinished
+    bool ok = true; // cleared whenever an ERROR comment is written
+    // Lines are ended with '\n' instead of endl, which would flush the stream
+    // after every single line; the stream is flushed when it is destroyed.
+    while( !tokens.isEmpty())
+        {
+        QString token = tokens.takeFirst();
+#if 0
+        static const char* const types[] = { "Error", "Other", "Opening", "Closing", "Standalone", "Text" };
+        QTextStream( stderr ) << "TOKEN(" << types[ analyzeToken( token ) ] << "): " << token << endl;
+#endif
+        switch( analyzeToken( token ))
+            {
+            case OpeningTag:
+                ensureNewLine( out, &needNewLine );
+                out << INDENT << token;
+                needNewLine = true; // text or the closing tag may continue this line
+                stack.push( tagName( token ));
+                break;
+            case ClosingTag:
+                {
+                QString tag = tagName( token );
+                if( stack.isEmpty())
+                    {
+                    ensureNewLine( out, &needNewLine );
+                    out << "<!-- ERROR: missing opening tag -->" << '\n';
+                    ok = false;
+                    }
+                else if( stack.top() != tag )
+                    { // TODO or try to find it in the stack?
+                    ensureNewLine( out, &needNewLine );
+                    out << "<!-- ERROR: opening/closing tag mismatch -->" << '\n';
+                    // Remember the error here: the stack may still become empty
+                    // later if the following closing tags happen to match.
+                    ok = false;
+                    }
+                else
+                    {
+                    stack.pop();
+                    }
+                if( !needNewLine ) // not line continuation
+                    out << INDENT;
+                out << token << '\n';
+                needNewLine = false;
+                break;
+                }
+            case StandaloneTag:
+                ensureNewLine( out, &needNewLine );
+                out << INDENT << token << '\n';
+                break;
+            case OtherTag:
+                ensureNewLine( out, &needNewLine );
+                out << INDENT << token << '\n';
+                break;
+            case Text:
+                if( !needNewLine ) // not line continuation
+                    out << INDENT;
+                out << token;
+                needNewLine = true;
+                break;
+            case Error:
+                ensureNewLine( out, &needNewLine );
+                out << "<!-- ERROR: cannot parse: " << token << "-->" << '\n';
+                ok = false;
+                break;
+            }
+        }
+    if( needNewLine )
+        out << '\n';
+    if( !stack.isEmpty())
+        {
+        out << "<!-- ERROR: missing closing tags -->" << '\n';
+        ok = false;
+        }
+    return ok;
+#undef INDENT
+    }
+
+// Usage: formatxml <file>
+//
+// Formats the given XML file. The result goes to stdout if stdout is not a
+// terminal (i.e. it is redirected), otherwise to "<file>.format.xml".
+//
+// Exit codes: 0 success, 1 format() reported a problem, 2 wrong usage,
+// 3 input cannot be read, 4 output cannot be written.
+int main( int argc, char* argv[] )
+    {
+    if( argc != 2 )
+        {
+        QTextStream( stderr ) << "Usage: " << argv[ 0 ] << " <file>" << endl;
+        return 2;
+        }
+    // Decode the name the way QFile encodes it again when opening the file,
+    // instead of relying on the implicit ASCII conversion of a char*.
+    const QString inputName = QFile::decodeName( argv[ 1 ] );
+    QFile fin( inputName );
+    if( !fin.open( QIODevice::ReadOnly ))
+        {
+        QTextStream( stderr ) << "File " << argv[ 1 ] << " cannot be read" << endl;
+        return 3;
+        }
+    QTextStream in( &fin );
+    QFile fout;
+    if( !isatty( STDOUT_FILENO ))
+        {
+        // stdout is redirected, write there.
+        if( !fout.open( stdout, QIODevice::WriteOnly ))
+            {
+            QTextStream( stderr ) << "Cannot write to stdout" << endl;
+            return 4;
+            }
+        }
+    else
+        {
+        const QString fname = inputName + ".format.xml";
+        fout.setFileName( fname );
+        if( !fout.open( QIODevice::WriteOnly ))
+            {
+            QTextStream( stderr ) << "Cannot write to " << fname << endl;
+            return 4;
+            }
+        }
+    QTextStream out( &fout );
+    in.setCodec( "UTF-8" );
+    out.setCodec( "UTF-8" );
+    return format( in, out ) ? 0 : 1;
+    }
More information about the Libreoffice-commits
mailing list