LibPDF: Do not crash on encrypted files that start unluckily

PDF files can be linearized. In that case, they start with a "linearization dict" that stores the key `/Linearized` and the value `1`. To check if a file is linearized, we just read the first dict, and then checked if it has that key. If the first object of a PDF was a stream with a compression filter and the input PDF was encrypted and not linearized, then us trying to decode the linearization dict could crash due to stream contents being encrypted, decryption state not yet being initialized, and us trying to decompress stream data before decrypting it. To prevent this, disable decompression when parsing the first object to determine if it's a linearization dictionary. (A linearization dict never stores string values, so decryption not yet being initialized is not a problem. Integer values aren't encrypted in encrypted PDF files.)
2025-01-23 09:51:57 -05:00 · 2023-07-10 15:49:48 -04:00 · 2023-07-10 15:49:48 -04:00 · 39b2eed3f6
commit 39b2eed3f6
parent c781686198
3 changed files with 19 additions and 1 deletions
--- a/Userland/Libraries/LibPDF/DocumentParser.cpp
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@ -124,7 +124,19 @@ PDFErrorOr<DocumentParser::LinearizationResult> DocumentParser::initialize_linea
        return LinearizationResult::NotLinearized;

    // At this point, we still don't know for sure if we are dealing with a valid object.
+
+    // The linearization dict is read before decryption state is initialized.
+    // A linearization dict only contains numbers, so the decryption dictionary is not needed (only strings and streams get decrypted, and only streams get unfiltered).
+    // But we don't know if the first object is a linearization dictionary until after parsing it, so the object might be a stream.
+    // If that stream is encrypted and filtered, we'd try to unfilter it while it's still encrypted, handing encrypted data to the unfiltering algorithms.
+    // This makes them assert, since they can't make sense of the encrypted data.
+    // So read the first object without unfiltering.
+    // If it is a linearization dict, there's no stream data and this has no effect.
+    // If it is a stream, this isn't a linearized file and the object will be read on demand (and unfiltered) later, when the object is lazily read via an xref entry.
+    set_filters_enabled(false);
    auto indirect_value_or_error = parse_indirect_value();
+    set_filters_enabled(true);
+
    if (indirect_value_or_error.is_error())
        return LinearizationResult::NotLinearized;

--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@ -474,7 +474,7 @@ PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictO
    if (m_document->security_handler() && m_enable_encryption)
        m_document->security_handler()->decrypt(stream_object, m_current_reference_stack.last());

-    if (dict->contains(CommonNames::Filter)) {
+    if (dict->contains(CommonNames::Filter) && m_enable_filters) {
        Vector<DeprecatedFlyString> filters;

        // We may either get a single filter or an array of cascading filters
--- a/Userland/Libraries/LibPDF/Parser.h
+++ b/Userland/Libraries/LibPDF/Parser.h
@ -57,6 +57,11 @@ public:
    PDFErrorOr<NonnullRefPtr<StreamObject>> parse_stream(NonnullRefPtr<DictObject> dict);
    PDFErrorOr<Vector<Operator>> parse_operators();

+    void set_filters_enabled(bool enabled) // Toggles whether parse_stream() applies a stream's /Filter entries while parsing.
+    {
+        m_enable_filters = enabled;
+    }
+
 protected:
    void push_reference(Reference const& ref) { m_current_reference_stack.append(ref); }
    void pop_reference() { m_current_reference_stack.take_last(); }
@ -73,6 +78,7 @@ protected:
    WeakPtr<Document> m_document;
    Vector<Reference> m_current_reference_stack;
    bool m_enable_encryption { true };
+    bool m_enable_filters { true }; // Unfiltering is on by default; toggled via set_filters_enabled().
 };

 };