shellcode.cpp

#include "pin.H"
#include <iostream>
#include <fstream>
#include <iomanip>
#include <set>
#include <list>
#include <sstream>

/**
* Specifies the maximum number of legit instructions the plugin keeps track of
* before control flow is transferred to the shellcode.
**/
const unsigned int MAX_LEGIT_INSTRUCTION_LOG_SIZE = 100;

/**
* Keeps track of legit instructions before control flow is transferred to she
* shellcode.
**/
std::list<std::string> legitInstructions;

/**
* Keeps track of disassembled instructions that were already dumped.
**/
std::set<std::string*> dumped;

/**
* Output file the shellcode information is dumped to.
**/
std::ofstream traceFile;

/**
* Command line option to specify the name of the output file.
* Default is shellcode.out.
**/
KNOB<string> outputFile(KNOB_MODE_WRITEONCE, "pintool", "o", "shellcode.out", "specify trace file name");

/**
* Prints usage information.
**/
INT32 usage()
{
    cerr << "This tool produces a call trace." << endl << endl;
    cerr << KNOB_BASE::StringKnobSummary() << endl;
    return -1;
}

/**
* Determines whether a given address belongs to a known module or not.
**/
bool isUnknownAddress(ADDRINT address)
{
	// An address belongs to a known module, if the address belongs to any
	// section of any module in the target address space.

	for(IMG img=APP_ImgHead(); IMG_Valid(img); img = IMG_Next(img))
	{
		for(SEC sec=IMG_SecHead(img); SEC_Valid(sec); sec = SEC_Next(sec))
		{
			if (address >= SEC_Address(sec) && address < SEC_Address(sec) + SEC_Size(sec))
			{
				return false;
			}
		}
	}

	return true;
}

/**
* Given a fully qualified path to a file, this function extracts the raw
* filename and gets rid of the path.
**/
std::string extractFilename(const std::string& filename)
{
	unsigned int lastBackslash = filename.rfind("\\");

	if (lastBackslash == -1)
	{
		return filename;
	}
	else
	{
		return filename.substr(lastBackslash + 1);
	}
}

/**
* Given an address, this function determines the name of the loaded module the
* address belongs to. If the address does not belong to any module, the empty
* string is returned.
**/
std::string getModule(ADDRINT address)
{
	// To find the module name of an address, iterate over all sections of all
	// modules until a section is found that contains the address.

	for(IMG img=APP_ImgHead(); IMG_Valid(img); img = IMG_Next(img))
	{
		for(SEC sec=IMG_SecHead(img); SEC_Valid(sec); sec = SEC_Next(sec))
		{
			if (address >= SEC_Address(sec) && address < SEC_Address(sec) + SEC_Size(sec))
			{
				return extractFilename(IMG_Name(img));
			}
		}
	}

	return "";
}

/**
* Converts a PIN instruction object into a disassembled string.
**/
std::string dumpInstruction(INS ins)
{
	std::stringstream ss;

	ADDRINT address = INS_Address(ins);

	// Generate address and module information
	ss << "0x" << setfill('0') << setw(8) << uppercase << hex << address << "::" << getModule(address) << "  ";

	// Generate instruction byte encoding
	for (int i=0;i<INS_Size(ins);i++)
	{
		ss << setfill('0') << setw(2) << (((unsigned int) *(unsigned char*)(address + i)) & 0xFF) << " ";
	}

	for (int i=INS_Size(ins);i<8;i++)
	{
		ss << "   ";
	}

	// Generate diassembled string
	ss << INS_Disassemble(ins);
	
	// Look up call information for direct calls
	if (INS_IsCall(ins) && INS_IsDirectBranchOrCall(ins))
	{
		ss << " -> " << RTN_FindNameByAddress(INS_DirectBranchOrCallTargetAddress(ins));
	}

	return ss.str();
}

/**
* Callback function that is executed every time an instruction identified as
* potential shellcode is executed.
**/
void dump_shellcode(std::string* instructionString)
{
	if (dumped.find(instructionString) != dumped.end())
	{
		// This check makes sure that an instruction is not dumped twice.
		// For a complete run trace it would make sense to dump an instruction
		// every time it is executed. However, imagine the shellcode has a
		// tight loop that is executed a million times. The resulting log file
		// is much easier to read if every instruction is only dumped once.

		return;
	}

	if (!legitInstructions.empty())
	{
		// If legit instructions have been logged before the shellcode is
		// executed, it is now a good time to dump them to the file. This
		// information then shows when control flow was transferred from
		// legit code to shellcode.

		traceFile << "Executed before" << endl;

		for (std::list<std::string>::iterator Iter = legitInstructions.begin(); Iter != legitInstructions.end(); ++Iter)
		{
			traceFile << *Iter << endl;
		}

		traceFile << endl << "Shellcode:" << endl;

		legitInstructions.clear();
	}

	traceFile << *instructionString << std::endl;

	dumped.insert(instructionString);
}

/**
* This function is called
**/
void traceInst(INS ins, VOID*)
{
	ADDRINT address = INS_Address(ins);

	if (isUnknownAddress(address))
	{
		// The address is an address that does not belong to any loaded module.
		// This is potential shellcode. For these instructions a callback
		// function is inserted that dumps information to the trace file when
		// the instruction is actually executed.

		INS_InsertCall(ins, IPOINT_BEFORE, AFUNPTR(dump_shellcode),
			IARG_PTR, new std::string(dumpInstruction(ins)), IARG_END
		);
	}
	else
	{
		// The address is a legit address, meaning it is probably not part of
		// any shellcode. In this case we just log the instruction to dump it
		// later to show when control flow was transfered from legit code to
		// shellcode.

		legitInstructions.push_back(dumpInstruction(ins));

		if (legitInstructions.size() > MAX_LEGIT_INSTRUCTION_LOG_SIZE)
		{
			// Log only up to MAX_LEGIT_INSTRUCTION_LOG_SIZE instructions or the whole
			// program before the shellcode will be dumped.

			legitInstructions.pop_front();
		}
	}
}

/**
* Finalizer function that is called at the end of the trace process.
* In this script, the finalizer function is responsible for closing
* the shellcode output file.
**/
VOID fini(INT32, VOID*)
{
    traceFile.close();
}

int main(int argc, char *argv[])
{
    PIN_InitSymbols();

    if( PIN_Init(argc, argv))
    {
        return usage();
    }

    traceFile.open(outputFile.Value().c_str());

    string trace_header = string("#\n"
                                 "# Shellcode detector\n"
                                 "#\n");
    

    traceFile.write(trace_header.c_str(), trace_header.size());
    
    INS_AddInstrumentFunction(traceInst, 0);
    PIN_AddFiniFunction(fini, 0);

    // Never returns
    PIN_StartProgram();
    
    return 0;
}