#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vi:ts=4:et

import pycurl
import re
try:
    from io import BytesIO
except ImportError:
    from StringIO import StringIO as BytesIO


headers = {}
def header_function(header_line):
    # HTTP standard specifies that headers are encoded in iso-8859-1.
    # On Python 2, the decoding step can be skipped.
    # On Python 3, the decoding step is required.
    header_line = header_line.decode('iso-8859-1')

    # Header lines include the first status line (HTTP/1.x ...).
    # We are going to ignore all lines that don't have a colon in them.
    # This will botch headers that are split on multiple lines...
    if ':' not in header_line:
        return

    # Break the header line into header name and value.
    name, value = header_line.split(':', 1)

    # Remove whitespace that may be present.
    # Header lines include the trailing newline, and there may be whitespace
    # around the colon.
    name = name.strip()
    value = value.strip()

    # Header names are case insensitive.
    # Lowercase name here.
    name = name.lower()

    # Now we can actually record the header name and value.
    headers[name] = value


buffer = BytesIO()
c = pycurl.Curl()
c.setopt(c.URL, 'http://pycurl.io')
c.setopt(c.WRITEFUNCTION, buffer.write)
# Set our header function.
c.setopt(c.HEADERFUNCTION, header_function)
c.perform()
c.close()

# Figure out what encoding was sent with the response, if any.
# Check against the lowercased header name.
encoding = None
if 'content-type' in headers:
    content_type = headers['content-type'].lower()
    match = re.search(r'charset=(\S+)', content_type)
    if match:
        encoding = match.group(1)
        print('Decoding using %s' % encoding)

if encoding is None:
    # Default encoding for HTML is iso-8859-1.
    # Other content types may have different default encodings,
    # or, in the case of binary data, no encoding at all.
    encoding = 'iso-8859-1'
    print('Assuming encoding is %s' % encoding)

body = buffer.getvalue()
# Decode using the encoding we figured out.
print(body.decode(encoding))
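

# Because a plain dict keeps only the last value seen for each name,
# repeated response headers (for example Set-Cookie) overwrite one another.
# The sketch below is one possible variant that accumulates every value in a
# list instead; the names headers_multi and multi_header_function are
# illustrative, not part of pycurl's API.
headers_multi = {}
def multi_header_function(header_line):
    # Same parsing steps as header_function above.
    header_line = header_line.decode('iso-8859-1')
    if ':' not in header_line:
        return
    name, value = header_line.split(':', 1)
    name = name.strip().lower()
    value = value.strip()
    # Append every occurrence of a header name instead of overwriting.
    headers_multi.setdefault(name, []).append(value)

# To use it, register this callback instead of header_function:
# c.setopt(c.HEADERFUNCTION, multi_header_function)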