<html> <head> <base href="https://bugs.freedesktop.org/" /> </head> <body> <div> <a class="bz_bug_link bz_status_NEW " title="NEW --- - [TAGGEDPDF] Parse the Tagged-PDF document structure tree when present" href="https://bugs.freedesktop.org/show_bug.cgi?id=64815#c37">Comment # 37</a> on <a class="bz_bug_link bz_status_NEW " title="NEW --- - [TAGGEDPDF] Parse the Tagged-PDF document structure tree when present" href="https://bugs.freedesktop.org/show_bug.cgi?id=64815">bug 64815</a> from <a class="email" href="mailto:carlosgc@gnome.org" title="Carlos Garcia Campos <carlosgc@gnome.org>"> Carlos Garcia Campos</a> <pre>Comment on <a href="attachment.cgi?id=82288" name="attach_82288" title="[PATCH v6 03/10] Tagged-PDF: Implement parsing of StructElem objects">attachment 82288</a> <a href="attachment.cgi?id=82288&action=edit" title="[PATCH v6 03/10] Tagged-PDF: Implement parsing of StructElem objects">[details]</a> <a href='page.cgi?id=splinter.html&bug=64815&attachment=82288'>[review]</a> [PATCH v6 03/10] Tagged-PDF: Implement parsing of StructElem objects Review of <a href="attachment.cgi?id=82288" name="attach_82288" title="[PATCH v6 03/10] Tagged-PDF: Implement parsing of StructElem objects">attachment 82288</a> <a href="attachment.cgi?id=82288&action=edit" title="[PATCH v6 03/10] Tagged-PDF: Implement parsing of StructElem objects">[details]</a> <a href='page.cgi?id=splinter.html&bug=64815&attachment=82288'>[review]</a>: ----------------------------------------------------------------- I still don't see the point of having two structs heap allocated and all the API methods with 'if isFoo()' to use one or the other, but I'm not opposed either if you guys think it's better. ::: poppler/StructElement.cc @@ +83,5 @@ > + { StructElement::Formula, "Formula", elementTypeUndefined }, > + { StructElement::Form, "Form", elementTypeUndefined }, > + { StructElement::TOC, "TOC", elementTypeUndefined }, > + { StructElement::TOCI, "TOCI", elementTypeUndefined }, > +}; All this is part of the standard types defined for tagged pdf, right? If we move this to a follow up patch and can merge this with the previous one, containing only logical structure implementation. @@ +126,5 @@ > +//------------------------------------------------------------------------ > +// StructElement > +//------------------------------------------------------------------------ > + > +const Ref StructElement::InvalidRef = { -1, -1 }; This looks unused here @@ +176,5 @@ > + c(new ContentData(mcid)) > +{ > + assert(treeRoot); > + assert(parent); > + assert(c->mcid != InvalidMCID); Instead of creating the ContentData struct with an invalid mcid, and asserting here, I think we should ensure that StructElement is never created with an invalid mcid. I don't understand in which situation mcid can be InvalidMCID or why -1 can't be a valid MCID, the spec says it's an integer that uniquely identifies the marked content sequence. @@ +188,5 @@ > +{ > + assert(treeRoot); > + assert(parent); > + assert(c->ref.num >= 0); > + assert(c->ref.gen >= 0); Same here, this should never be called with an invalid ref and I think it's already ensured by the callers. @@ +226,5 @@ > + if (parent) > + return parent->getPageRef(); > + > + static const Ref invalidRef = { -1, -1 }; > + return invalidRef; I would make this boolean returning the Ref as an out parameter instead of using this invalidRef @@ +299,5 @@ > + obj.free(); > + > + // Object ID (optional), to be looked at the IDTree in the tree root. > + if (element->lookup("ID", &obj)->isString()) { > + s->id = new GooString(obj.getString()); It's probably simpler to do obj.getString()->copy(), but it's exactly the same. @@ +317,5 @@ > + obj.free(); > + > + // Element title (optional). > + if (element->lookup("T", &obj)->isString()) { > + s->title = new GooString(obj.getString()); Ditto. @@ +324,5 @@ > + > + // Language (optional). > + if (element->lookup("Lang", &obj)->isString()) { > + s->language = obj.getString(); > + obj.initNull(); // The StructElement takes ownership of the GooString I still don't understand why you are doing this here and not for the title for example. And I still think the proper way of doing this is adding Object::takeString() or something like that. Another way of avoid duplicating the string is to save the Object instead of getting the String contained. @@ +331,5 @@ > + > + // Alternative text (optional). > + if (element->lookup("Alt", &obj)->isString()) { > + s->altText = obj.getString(); > + obj.initNull(); // The StructElement takes ownership of the GooString Ditto. @@ +338,5 @@ > + > + // Expanded form of an abbreviation (optional). > + if (element->lookup("E", &obj)->isString()) { > + s->expandedAbbr = obj.getString(); > + obj.initNull(); // The StructElement takes ownership of the GooString Ditto. @@ +345,5 @@ > + > + // Actual text (optional). > + if (element->lookup("ActualText", &obj)->isString()) { > + s->actualText = obj.getString(); > + obj.initNull(); // The StructElement takes ownership of the GooString Ditto. @@ +378,5 @@ > + mcidObj.free(); > + return NULL; > + } > + > + child = new StructElement(mcidObj.getInt(), treeRoot, this); mcidObj.free() after this. @@ +383,5 @@ > + > + if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) { > + child->pageRef = pageRefObj; > + } else { > + pageRefObj.free(); I don't think you need the else, you can call .free unconditionally: if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) child->pageRef = pageRefObj; pageRefObj.free(); @@ +396,5 @@ > + > + if (childObj->dictLookupNF("Pg", &pageRefObj)->isRef()) { > + child->pageRef = pageRefObj; > + } else { > + pageRefObj.free(); Ditto. @@ +403,5 @@ > + error(errSyntaxError, -1, "Obj object is wrong type ({0:s})", refObj.getTypeName()); > + } > + refObj.free(); > + } else if (childObj->isDict()) { > + assert(ref->isRef()); Why is this required to be a valid ref when child object is a dict? ::: poppler/StructElement.h @@ +63,3 @@ > > + inline GBool isContent() const { return (type == MCID && c->mcid != InvalidMCID) || isObjectRef(); } > + inline GBool isObjectRef() const { return (type == OBJR && c->ref.num != -1 && c->ref.gen != -1); } I don't think we should consider the cases of invalid objects, isOk should return FALSE in all those cases @@ +63,5 @@ > > + inline GBool isContent() const { return (type == MCID && c->mcid != InvalidMCID) || isObjectRef(); } > + inline GBool isObjectRef() const { return (type == OBJR && c->ref.num != -1 && c->ref.gen != -1); } > + > + int getMCID() const { return type == MCID ? c->mcid : InvalidMCID; } This is not actually public API, so I think it's simpler to assume this is only going to be called when type == MCID, so the caller should ensure that by checking the type first. @@ +64,5 @@ > + inline GBool isContent() const { return (type == MCID && c->mcid != InvalidMCID) || isObjectRef(); } > + inline GBool isObjectRef() const { return (type == OBJR && c->ref.num != -1 && c->ref.gen != -1); } > + > + int getMCID() const { return type == MCID ? c->mcid : InvalidMCID; } > + Ref getObjectRef() const { return type == OBJR ? c->ref : InvalidRef; } Ditto. ::: poppler/StructTreeRoot.cc @@ +115,4 @@ > for (int i = 0; i < kids.arrayGetLength(); i++) { > Object obj, ref; > kids.arrayGetNF(i, &ref); > + assert(ref.isRef()); Why should this be a valid ref? Shouldn't we ignore invalid entries and continue parsing instead of crashing?</pre> </div> <hr> You are receiving this mail because: <ul> <li>You are on the CC list for the bug.</li> </ul> </body> </html>