diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4278fe4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +pybytecode.egg-info +dist +*.*~ +__pycache__ +*.pyc diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4125318 --- /dev/null +++ b/LICENSE @@ -0,0 +1,12 @@ +Copyright (c) 2014, Risto Stevcev +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..bb3ec5f --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..7230e9f --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +# Python Byte-code Compiler + +This app provides the ability to convert Python files into their **.pyc** files. Python **.pyc** files are *compiled* Python files, compiled into byte-code. If you ever wondered why sometimes Python generates these and the ``__pycache__`` folder, it's for performance reasons. + +The purpose of this exercise is to expose the internals of Python so that some people might experiment with writing their own language that runs on the Python virtual machine. A lot of the more recent languages such as Scala and Clojure run on the JVM. They've become popular because they immediately come with *batteries included* so-to-speak, because they're capable of importing all existing Java libraries. Python is arguably a cleaner language than Java, and so it would be advantageous to have a functional language, for example, that integrates well with Python--a language that follows Pythonic principles (see ``import this``). I plan on working on such a language, but I'd like to open the flood gates for everyone else as well. + + +## Generating byte-code (*.pyc files*) + +The structure of **.pyc** files is as follows: + +1. *4 bytes*: **Magic number** +2. *4 bytes*: **Timestamp** +3. *4 bytes*: **Padding** +4. *N bytes*: **Marshalled code object** + +You can get each segment to create a **.pyc** file in the following ways: + +1. The **magic number** corresponds to the required Python version. You can get this number through the **imp** module: +``import imp`` +``magic_number = imp.get_magic()`` +2. The **timestamp** corresponds to the time it was created. If there's a corresponding **.py** file, it checks this timestamp with that file to see if they match. 
Otherwise it's irrelevant if the .pyc file is on its own. You can get this number by using the **time** and **struct** modules: +``import struct, time`` +``timestamp = struct.pack('i', int(time.time()))`` +3. The **padding** is just padding before the code object, basically a 4-byte sequence of 0's. This padding seems to only be in Python 3, so **omit it for Python 2**. Sometimes the first byte has some value, but it doesn't seem relevant. You can just use this bytestring: +``padding = b'A\x00\x00\x00'`` +4. The **code object** is a marshalled python code object. You can use the ``compile`` command to compile a segment of python code into a code object to test this out initially. The command signature is ``compile(code_segment, 'file_name', 'exec')``. You need to make sure that ``file_name`` corresponds to the filename you are writing the **.pyc** file into. Here's a simple example: +``import marshal`` +``filename = 'addnum.py'`` +``code_segment = 'a = 123 + 321\nprint(a)'`` +``code = compile(code_segment, filename, 'exec')`` +``marshalled_code_object = marshal.dumps(code)`` + + +You can put it all together like this: + + # write to addnum.pyc + with open(filename + 'c', 'wb') as f: + f.write(magic_number) + f.write(timestamp) + f.write(padding) + f.write(marshalled_code_object) + +And then you can test it out like a regular Python file; it should work! + + $ python addnum.pyc + 444 + +You can test out the bytecode compiler by running ``python bytecode.py [.py file]`` or ``pybytecode [.py file]`` from the command-line. + + +## Writing code objects + +You can write Python code objects by importing the CodeType type like this: ``from types import CodeType``. You can view the help for the required parameters (``help(CodeType)``), and there's quite a bit of documentation online about the different portions of the python code object. 
[Alberto's StackOverflow post](http://stackoverflow.com/questions/16064409/how-to-create-a-code-object-in-python) provides a fairly decent overview of each one. I've included his code as part of ``codegen.py``. See this README's Resources section for opcodes so you can start writing a byte-code compiler for your own language that can be read using the Python virtual machine! + +You can test out the code generator by running ``python codegen.py`` or ``pycodegen`` from the command-line. + + +## Resources + +* Python bytecode instructions and their descriptions can be found in the **dis** module [documentation](https://docs.python.org/2/library/dis.html#python-bytecode-instructions). + +* You can view all of the python opcodes from Python's source code in [Include/opcode.h](https://github.com/python/cpython/blob/master/Include/opcode.h). + +* If in doubt, create a code object of the type of segment you need using ``code = compile(code_segment, my_file_name, 'exec')``, then disassemble it using ``dis.dis(code)``, and then create the bytecode by translating it to the opcodes and the params that go with them (see codegen.py and opcode.h). + +* Though neither of these is maintained anymore, you might want to check out [PEAK](http://peak.telecommunity.com/DevCenter/BytecodeAssembler) and [Byteplay](http://code.google.com/p/byteplay/) for bytecode assembly. 
#!/usr/bin/env python
"""Compile a Python source file into a ``.pyc`` file next to it."""
import marshal
import struct
import time
import sys
import os

try:
    # ``imp`` is deprecated since Python 3.4 and removed in Python 3.12, so
    # prefer the importlib constant whenever it is available.
    from importlib.util import MAGIC_NUMBER
except ImportError:  # Python 2 / 3.3: fall back to the legacy module
    import imp
    MAGIC_NUMBER = imp.get_magic()


class Compiler(object):
    """Byte-compile one source file and write the matching ``.pyc``.

    Attributes set by ``__init__``:
        data: text of the source file.
        filename: base name of the source file (also used as the code
            object's file name).
        dirname: absolute directory that contains the source file.
        magic_number: magic bytes of the running interpreter.
        modification_date: source mtime packed as 4 little-endian bytes.
        padding: source size packed as 4 little-endian bytes.  The name is
            kept for backward compatibility; this is the header field that
            follows the timestamp.
        code: code object produced by ``compile``.
        bytes_code: marshalled form of ``code``.
    """

    def __init__(self, path):
        """Read and compile the source file at *path*.

        Raises whatever ``open``/``os.stat`` raise for an unreadable path and
        ``SyntaxError`` if the source does not compile.
        """
        with open(path, 'r') as f:
            self.data = f.read()
        self.filename = os.path.basename(path)
        self.dirname = os.path.dirname(os.path.abspath(path))

        source_stat = os.stat(path)
        self.magic_number = MAGIC_NUMBER
        # .pyc header fields are little-endian 32-bit values regardless of
        # the host byte order; mask so large values cannot overflow pack().
        self.modification_date = struct.pack(
            '<I', int(source_stat.st_mtime) & 0xFFFFFFFF)
        self.padding = struct.pack('<I', source_stat.st_size & 0xFFFFFFFF)

        self.code = compile(self.data, self.filename, 'exec')
        self.bytes_code = marshal.dumps(self.code)

    def _header(self):
        """Return the .pyc header bytes for the running interpreter."""
        if sys.version_info >= (3, 7):
            # PEP 552 layout: magic, flags, mtime, source size.
            # flags == 0 selects the classic timestamp-based validation.
            return (self.magic_number + struct.pack('<I', 0) +
                    self.modification_date + self.padding)
        if sys.version_info >= (3, 3):
            # magic, mtime, source size
            return self.magic_number + self.modification_date + self.padding
        # Older interpreters: magic, mtime
        return self.magic_number + self.modification_date

    def compile(self):
        """Write ``<dirname>/<filename>c`` (header + marshalled code)."""
        target = os.path.join(self.dirname, self.filename + 'c')
        with open(target, 'wb') as f:
            f.write(self._header())
            f.write(self.bytes_code)


def main():
    """Command-line entry point: compile the file named by ``sys.argv[1]``."""
    if len(sys.argv) != 2:
        print('Usage: %s [python file]' % sys.argv[0])
        sys.exit(1)

    compiler = Compiler(sys.argv[1])
    compiler.compile()


if __name__ == '__main__':
    main()
"""Print the n-th Fibonacci number (sample input for the bytecode compiler)."""
import sys


def fibonacci(n):
    """Return the n-th Fibonacci number, with fibonacci(0) == 0.

    Computed iteratively in O(n) additions, so large ``n`` neither blows the
    recursion limit nor takes exponential time.

    Raises:
        ValueError: if ``n`` is negative (the recursive definition never
            terminates for negative input).
    """
    if n < 0:
        raise ValueError('n must be non-negative, got %r' % (n,))

    previous, current = 0, 1
    for _ in range(n):
        previous, current = current, previous + current
    return previous


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage: %s [length]' % sys.argv[0])
        sys.exit(1)

    print(fibonacci(int(sys.argv[1])))
url='https://github.com/Risto-Stevcev/pybytecode', + description="A Python bytecode compiler and bytecode generator.", + long_description=open("README.md","r").read(), + packages=find_packages(), + entry_points = { + 'console_scripts': ['pybytecode=pybytecode.bytecode:main', + 'pycodegen=pybytecode.codegen:main'], + }, + keywords = "compiler bytecode python code", + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: BSD License', + 'Natural Language :: English', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 2.7', + 'Topic :: Software Development :: Assemblers', + 'Topic :: Software Development :: Code Generators', + 'Topic :: Software Development :: Compilers', + 'Topic :: Software Development :: Disassemblers', + ], +) +