# README.md
# Rendering markdown...
# Original idea from Hitcon 2022 web2pdf challenge: https://blog.splitline.tw/hitcon-ctf-2022/#%F0%9F%93%83-web2pdf-web
# Example usage: python extract_pdf_images.py Ticket-831767.pdf -v
import fitz # PyMuPDF
from PIL import Image
import base64
import zlib
import re
from argparse import ArgumentParser
# Command-line interface: one positional PDF path plus an optional verbose flag.
parser = ArgumentParser(
    description=(
        "Extract file contents embedded in bitmap images in PDF file. "
        "File content may be plaintext or base64/zlib compressed. "
        "Extracted bitmap images and file contents are written to disk"
    ),
)
parser.add_argument("file", help="PDF file path")
parser.add_argument("-v", "--verbose", help="Verbose output", action="store_true")
args = parser.parse_args()

# Module-level settings read by the functions and the main loop below.
PDF_FILE_PATH, VERBOSE = args.file, args.verbose
def decompress(data: bytes, chunk_size: int = 1024) -> bytes:
    """Best-effort inflate of *data* using a ``wbits=-15`` zlib decompressor.

    The input is handed to the decompressor in slices of *chunk_size* bytes
    so that everything inflated before a corrupt region is still returned.

    Args:
        data: Candidate compressed bytes.
        chunk_size: Number of input bytes fed to the decompressor per step.

    Returns:
        The bytes inflated up to the end of the stream, the end of the input,
        or the first ``zlib.error`` -- whichever comes first. ``b''`` when
        nothing could be inflated.
    """
    decompressor = zlib.decompressobj(wbits=-15)
    # Collect pieces and join once; repeated ``bytes +=`` copies the whole
    # accumulated output on every chunk.
    pieces = []
    for offset in range(0, len(data), chunk_size):
        try:
            pieces.append(decompressor.decompress(data[offset:offset + chunk_size]))
        except zlib.error as e:
            if VERBOSE:
                print(f"Zlib error encountered at chunk {offset}: {e}. Stopping decompression.")
            # Keep whatever was inflated before the corrupt chunk.
            break
        if decompressor.eof:
            # The compressed stream is complete; the remaining input is
            # trailing data that would yield no further output.
            break
    pieces.append(decompressor.flush())
    return b"".join(pieces)
def decodeb64(encoded_data: bytes, min_b64_output_bytes: int = 12) -> bytes:
    """Decode as much leading Base64 as possible, else treat input as text.

    The stripped input is decoded four bytes at a time with strict
    validation, stopping at the first block that is not valid Base64.

    Args:
        encoded_data: Candidate Base64 bytes (surrounding whitespace ignored).
        min_b64_output_bytes: Minimum number of decoded bytes required for
            the Base64 interpretation to be accepted.

    Returns:
        The decoded bytes (possibly only a valid prefix) when at least
        ``min_b64_output_bytes`` were produced; otherwise the stripped input
        with non-printable characters removed.
    """
    payload = encoded_data.strip()
    quad_len = 4

    # Decode quad by quad so a bad block only costs what follows it.
    pieces = []
    hit_invalid_block = False
    for start in range(0, len(payload), quad_len):
        try:
            pieces.append(base64.b64decode(payload[start:start + quad_len], validate=True))
        except base64.binascii.Error:
            hit_invalid_block = True
            break

    decoded = b"".join(pieces)
    decoded_len = len(decoded)

    if decoded_len >= min_b64_output_bytes:
        # Enough output to trust the Base64 interpretation, even if partial.
        if hit_invalid_block and VERBOSE:
            print(f"B64 decode failed after {decoded_len} bytes (>= {min_b64_output_bytes}). Returning partial output.")
        return decoded

    # Too little output: assume the input was plain text all along.
    if VERBOSE:
        if hit_invalid_block:
            print(f"B64 decode failed after only {decoded_len} bytes. Falling back to plain text.")
        else:
            print(f"Full B64 decode resulted in only {decoded_len} bytes (< {min_b64_output_bytes}). Falling back to plain text.")
    return _clean_unprintable_bytes(payload)
# --- Helper Function for Plain Text Cleaning ---
# Every byte value that is NOT printable ASCII (0x20-0x7E), LF, CR or TAB.
# Built once at import time and used as the delete table for bytes.translate.
_UNPRINTABLE_BYTES = bytes(
    value for value in range(256)
    if not (0x20 <= value <= 0x7E or value in b"\n\r\t")
)


def _clean_unprintable_bytes(data: bytes) -> bytes:
    """Return *data* with every non-printable byte removed.

    Only printable ASCII (0x20-0x7E) plus newline, carriage return and tab
    are kept; control characters and all bytes >= 0x80 are dropped.

    Args:
        data: Raw bytes to clean.

    Returns:
        A ``bytes`` object containing only the kept characters, in order.
    """
    # One pass over the bytes; no decode / regex / encode round-trip needed.
    return bytes(data).translate(None, _UNPRINTABLE_BYTES)
def extract_data(filename):
    """Recover an embedded payload from *filename* and write it to disk.

    Everything after the first ``b'\\x1b$)C'`` marker is taken as the payload,
    NUL bytes are stripped, and the result is passed through ``decodeb64``
    and then ``decompress``. The most processed non-empty form is written to
    ``<filename>.extracted``; nothing is written when the payload is empty.

    Any exception is reported on stdout rather than propagated.
    """
    try:
        with open(filename, 'rb') as src:
            raw = src.read()

        # NOTE(review): the marker presumably comes from the referenced CTF
        # technique -- confirm before changing it.
        _, _, tail = raw.partition(b'\x1b$)C')
        payload = tail.replace(b'\x00', b'')

        decoded = decodeb64(payload)
        inflated = decompress(decoded)

        # Prefer inflated output, then decoded output, then the raw payload.
        result = inflated or decoded or payload

        if VERBOSE:
            print(result)
        if not result:
            return

        out_name = filename + '.extracted'
        with open(out_name, 'wb') as dst:
            dst.write(result)
        print(f'Wrote extracted data to: {out_name}')
    except Exception as e:
        print(f'Unexpected error extracting data: {e}')
# Main flow: dump every image referenced by the PDF as a BMP file in the
# current directory, then try to recover an embedded payload from each BMP.
try:
    pdf_file = fitz.open(PDF_FILE_PATH)
    try:
        for page_number, page in enumerate(pdf_file, start=1):
            image_list = page.get_images(full=True)
            if not image_list:
                continue
            if VERBOSE:
                print(f"Found {len(image_list)} images on page {page_number}")
            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]
                try:
                    # Use PyMuPDF's pixmap to get the raw image data
                    pix = fitz.Pixmap(pdf_file, xref)
                    # Pillow is told below that the buffer is 3-byte RGB, so
                    # grayscale/CMYK pixmaps must be converted first.
                    if pix.n - pix.alpha != 3:
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    # A colorspace conversion keeps the alpha channel; drop it
                    # explicitly, as BMP output here has no alpha.
                    if pix.alpha:
                        pix = fitz.Pixmap(pix, 0)
                    # Create a Pillow image object from the raw pixel data
                    pil_image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                    # Define the filename and save using Pillow in the BMP format
                    image_filename = f"page{page_number}_img{image_index}.bmp"
                    pil_image.save(image_filename, "BMP")
                    if VERBOSE:
                        print(f"Saved original image as BMP: {image_filename}")
                    extract_data(image_filename)
                except Exception as e:
                    # One bad image must not stop the remaining ones.
                    print(f"Error processing image {image_index} on page {page_number}: {e}")
                # Blank line separating the output of consecutive images.
                print()
    finally:
        # Release the document even if page iteration fails part-way.
        pdf_file.close()
except Exception as e:
    print(f"An error occurred: {e}")