空指针3月公开赛write up

做了下空指针公开赛的逆向题,做到三分之一左右遇到知识盲区就做不下去了,我太菜了。。。后面看了下官方wp才解决了知识盲区。。。

一打开套了个UPX壳,直接脱掉。

主逻辑在sub_123413B0

稍微看一下就能发现一些花指令,模式是

1
2
3
4
5
6
call $+9
db
jmp $+9
db
add dword ptr [esp], 1
retn

还有

1
2
3
4
5
jmp $+5
db
retn
db
call $-2

这部分的去花脚本如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def next_instr(addr):
return addr+ItemSize(addr)

def patch_nop(begin,end):
while(end>begin):
PatchByte(begin,0x90)
begin=begin+1
MakeCode(begin)

def undefine(addr, end):
while addr<end:
MakeUnkn(addr, 0)
addr += 1

def dejunk(addr, end):
while addr < end:
MakeCode(addr)
next = next_instr(addr)
if GetMnem(addr) == 'call':
if GetOperandValue(addr, 0) == addr + 9:
print("call jmp ret: %08x"%addr)
next = addr + 15
patch_nop(addr, next)
addr = next
continue
elif GetOperandValue(addr, 0) == addr + 6 and GetDisasm(addr + 6) == "add esp, 4":
print("call add esp: %08x"%addr)
next = addr + 9
patch_nop(addr, next)
addr = next
continue
if GetMnem(addr) == 'jmp' and GetOperandValue(addr, 0) == addr + 5 and Byte(addr+3) == 0xC3:
print('jmp ret call: %08x'%addr)
next = addr + 10
patch_nop(addr, next)
addr = next
continue
addr = next


dejunk(0x123413B0, 0x1234165D)
undefine(0x123413B0, 0x1234165D)
MakeCode(0x123413B0)

去除后:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
int sub_123413B0()
{
int v0; // eax
char *input; // [esp+6Ch] [ebp-Ch]

show(0);
input = (char *)malloc(strlen("npointer{") + strlen("}") + 33);
printf("Input the correct keys: ");
scanf("%s", input);
if ( !memcmp(input, "npointer{", strlen("npointer{"))
&& input[strlen(input) - 1] == asc_12343738[0]// }
&& (input[strlen(input) - 1] = 0, check((int)&code, 0x30D3u), v0) )
{
show(1u);
printf("Congrats!\n");
}
else
{
printf("Sorry, the gate remains closed.\n");
}
return system("pause");
}

逻辑很简单,主要看check函数

check依然加了花,结尾多了一种花指令

1
2
3
call $+6
db
add esp, 4

去除后的check函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
int __cdecl check(int code, SIZE_T size, int flag)
{
HANDLE v4; // eax
int v5; // ST38_4
LPVOID Dst; // [esp+1Ch] [ebp-14h]

Dst = VirtualAlloc(0, size, 0x3000u, 0x40u);
if ( !Dst )
return 0;
memcpy(Dst, (const void *)code, size);
v4 = GetCurrentProcess();
FlushInstructionCache(v4, Dst, size);
check_env();
v5 = ((int (__fastcall *)(_DWORD, int))Dst)(0, flag);
VirtualFree(Dst, 0, 0x8000u);
return v5;
}

分配内存,拷贝code。简单看下check_env

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
signed int anti_debug()
{
unsigned int i; // [esp+10h] [ebp-1Ch]
char *PBuffer; // [esp+14h] [ebp-18h]
size_t PBufferSizeInBytes; // [esp+18h] [ebp-14h]
char VarName[4]; // [esp+1Ch] [ebp-10h]
int v5; // [esp+20h] [ebp-Ch]
__int16 v6; // [esp+24h] [ebp-8h]
char v7; // [esp+26h] [ebp-6h]

*(_DWORD *)VarName = 0;
v5 = 0;
v6 = 0;
v7 = 0;
for ( i = 0; i < 0xB; ++i )
VarName[i] = ~byte_1234686C[i];
if ( dupenv_s(&PBuffer, &PBufferSizeInBytes, VarName) )
return 0;
if ( !PBuffer )
return 0;
free(PBuffer);
return 1;
}

这里的1234686c取反出来是WINDBG_DIR,一般反调试不会只加个windbg进黑名单(因为IDA和x86dbg用的比较多,至少我是这样的。。。),而且这种反调试非常明显。其实这里算是一个hint吧

所以最重要的部分在code里面,也就是12343798

在跳转到这里之前,还给一些寄存器赋了值:

1
2
3
4
# ebx: length of input
# ecx: idx, initialize to zero
# edx: input
# edi: base_addr

分析一下12343798开始的代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
.rdata:12343798                 cmp     ecx, ebx
.rdata:1234379A jge short loc_123437A3
.rdata:1234379C mov al, [edx+ecx]
.rdata:1234379F cmp al, 66h
.rdata:123437A1 jz short loc_123437A6
.rdata:123437A3
.rdata:123437A3 loc_123437A3: ; CODE XREF: .rdata:1234379A↑j
.rdata:123437A3 xor eax, eax
.rdata:123437A5 retn
.rdata:123437A6 ; -----------------------------------------------------------------
.rdata:123437A6
.rdata:123437A6 loc_123437A6: ; CODE XREF: .rdata:123437A1↑j
.rdata:123437A6 push 75h
.rdata:123437A8 xor dword ptr [esp], 46h
.rdata:123437AC call loc_123437B2
.rdata:123437AC ; -----------------------------------------------------------------
.rdata:123437B1 db 8Eh
.rdata:123437B2 ; -----------------------------------------------------------------
.rdata:123437B2
.rdata:123437B2 loc_123437B2: ; CODE XREF: .rdata:123437AC↑j
.rdata:123437B2 mov dword ptr [esp], 49Eh
.rdata:123437B9 add [esp], edi
.rdata:123437BC inc ecx
.rdata:123437BD retf

首先对比第一位需要为f,接下来的代码比较关键:

1
2
3
4
5
6
7
8
push	75h
xor dword ptr [esp], 46h
db
call $+6
mov dword ptr [esp], 49Eh
add [esp], edi
inc ecx
retf

这里其实也是花指令,稍作分析这段代码等价于:

1
2
3
4
5
push	33h
push 49Eh ; 下面这两句相当于push (edi+49eh)
add [esp], edi
inc ecx
retf

自认为我的汇编基本功还是可以的,最早是看的王爽的《汇编语言》这本书,从8086学过来,所以还认得retf这条指令。这时在16位以下才用得到的,它相当于:

1
2
pop		ip
pop cs

也就是在16位汇编下跳转到cs:ip地址处。

然而这里是32位,据我所知32位以上段寄存器不再被用来指向段了。那么被用来做什么呢,这我倒是没想过。然后就因为这个知识盲区卡住做不下去了QAQ

强行往下看,下面的代码十分的奇怪,基本看不出什么意义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
.rdata:123437BE                 dec     eax
.rdata:123437BF cmp ecx, ebx
.rdata:123437C1 jge short loc_123437D6
.rdata:123437C3 mov al, [si]
.rdata:123437C6 or bh, [ebx+esi]
.rdata:123437C9 jz short loc_12343823
.rdata:123437CB call near ptr sub_123437D2
.rdata:123437D0 jno short near ptr loc_123437B9+1
.rdata:123437D2
.rdata:123437D2 ; =============== S U B R O U T I N E =======================================
.rdata:123437D2
.rdata:123437D2
.rdata:123437D2 sub_123437D2 proc far ; CODE XREF: .rdata:123437CB↑p
.rdata:123437D2
.rdata:123437D2 var_8 = dword ptr -8
.rdata:123437D2 arg_0 = dword ptr 8
.rdata:123437D2
.rdata:123437D2 dec eax
.rdata:123437D3 add esp, 8
.rdata:123437D6
.rdata:123437D6 loc_123437D6: ; CODE XREF: .rdata:123437C1↑j
.rdata:123437D6 dec eax
.rdata:123437D7 xor eax, eax
.rdata:123437D9 call loc_123437DF

没办法,最后尝试调试。用x86dbg调试到前面都没问题,retf后发现先断到ntdll中,然后莫名来到0x20511这里了(这里VirtualAlloc分配的基地址是0x20000,即源代码中的12343CA9,又是莫名其妙。

我一度认为这里存在代码自修改,因此找了挺久有没有漏掉的地方,然后一无所获就放弃了

等到官方wp放出来才恍然大悟。

其实如果做题的时候搜索一下关键字cs 0x33就会有所收获,可能做题不在状态,或者还是菜。。。

原来cs 0x33的作用是把windows进程从32位切换为64位。

我们知道64位windows运行32位程序是在WoW64上的(这是Windows 32-bit on Windows 64-bit的缩写,所以WoW64才是32位)。WoW64既然是Windows的子系统,就应该可以在WoW64上运行64位的代码。cs 0x33就是干这个的。运行32位代码的时候是cs 0x23,64位则是cs 0x33,只要改变cs为33就能够实现切换,本题中这段代码就是干这个事的。

懂了这点后就能继续往下做了。那么根据逻辑接下来来到base+0x49e处。

这里就很蛋疼了,由于我们使用ida32打开这个程序的,所以暂时没办法在ida中直接看64位代码。我们借助capstone这个逆向神器,直接在ida的python中看看这些代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from capstone import *
md_64 = Cs(CS_ARCH_X86, CS_MODE_64)
md_32 = Cs(CS_ARCH_X86, CS_MODE_32)


def get_string(addr, len):
res = ""
for i in range(len):
res += chr(Byte(addr + i))
return res



def d64(addr, len=20):
global md_64
print("========================================================")
base = 0
code_x64 = get_string(addr, 0x200)
j = 0
for i in md_64.disasm(code_x64, addr):
if j >= len:
break
j += 1
print("0x%x:\t%s\t%s" %(i.address, i.mnemonic, i.op_str))
print("========================================================")


def d32(addr, len=20):
global md_32
print("========================================================")
code_x86 = get_string(addr, 0x200)
j = 0
for i in md_32.disasm(code_x86, addr):
if j >= len:
break
j += 1
print("0x%x:\t%s\t%s" %(i.address, i.mnemonic, i.op_str))
print("========================================================")

然后就能凑活着反编译64位代码看了。

更简单的办法就是,把code复制进新的文件,赋值两份,一份用ida打开,一份用ida64打开,对比着看。这样我们就能用ida的很多操作了。

然后顺便去一下花,简单分析花的特点都是向后call 1-3字节然后,我们只需要把中间的垃圾值nop掉就行了:

复制到新的文件就不用再计算base了,也舒服了一些。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
with open("gatesXgame_d.exe", "rb") as f:
raw = f.read()
code = raw[0x2198:0x2198+0x30d3]
i = 0
while i<len(code)-5:
if code[i:i+5] == b"\xe8\x01\x00\x00\x00":
code = code[:i+5]+b"\x90" + code[i+6:]
i+=6
elif code[i:i+5] == b"\xe8\x02\x00\x00\x00":
code = code[:i+5]+b"\x90"*2 + code[i+7:]
i+=7
elif code[i:i+5] == b"\xe8\x03\x00\x00\x00":
code = code[:i+5]+b"\x90"*3 + code[i+8:]
i+=8
else:
i+=1

with open("code", "wb") as f:
f.write(code)

接下来可以分析代码了。首先地址0的32位代码,判断第一个为f,然后切换到64位,跳到49e,根据第二位为0或4,跳到下一个32位代码块,然后又。。。

每个代码块都会进行一个以上的判断,然后调到下一个代码块(都不对会返回0)。每个代码块之间的切换必定会切一次换位数。最终要去到返回1的地方。搜一下retn这条指令,在30d0(其实就是最后啦)。

很显然这是个图算法。代码量很大,所以需要写脚本提取出图的内容。由于每次节点的切换都会切换位数,所以大大降低了写脚本的难度。我们只需要根据一些特征提取数据就行了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from capstone import *
from struct import *
md_64 = Cs(CS_ARCH_X86, CS_MODE_64)
md_32 = Cs(CS_ARCH_X86, CS_MODE_32)
nodes = {}
with open("code", "rb") as f:
code = f.read()


def dis32(addr):
global code, md_32
tmp = code[addr:addr + 10]
return md_32.disasm(tmp, addr).__next__()

def dis64(addr):
global code, md_64
tmp = code[addr:addr + 10]
return md_64.disasm(tmp, addr).__next__()

def exp32(addr):
now = dis32(addr)
assert now.bytes == b'9\xd9'
addr += now.size
now = dis32(addr)
ret_addr = int(now.op_str, 16)
ret = dis32(ret_addr)
assert ret.bytes == b'1\xc0'

addr += now.size # mov al, byte ptr [edx + ecx]
now = dis32(addr)
assert now.bytes == b'\x8a\x04\n'
node = []
addr += now.size
while True:
if addr == ret_addr:
break
now = dis32(addr)
assert now.mnemonic == 'cmp'
next_ch = chr(int(now.op_str[4:], 16))

addr += now.size
now = dis32(addr)
jz_addr = int(now.op_str, 16)
next_addr = unpack("<I", code[jz_addr+15:jz_addr+15+4])[0]
next = [next_addr, next_ch]
node.append(next)
while True:
addr += now.size
now = dis32(addr)
if now.mnemonic == 'cmp' or addr == ret_addr:
break
return node

def exp64(addr):
now = dis64(addr)
assert now.bytes == b'H9\xd9'
addr += now.size
now = dis64(addr)
ret_addr = int(now.op_str, 16)
ret = dis64(ret_addr)
assert ret.bytes == b'H1\xc0'

addr += now.size # mov al, byte ptr [edx + ecx]
now = dis64(addr)
assert now.bytes == b'g\x8a\x04\n'
node = []
addr += now.size
while True:
if addr == ret_addr:
break
now = dis64(addr)
assert now.mnemonic == 'cmp'
next_ch = chr(int(now.op_str[4:], 16))

addr += now.size
now = dis64(addr)

jz_addr = int(now.op_str, 16)
next_addr = unpack("<I", code[jz_addr+21:jz_addr+21+4])[0]
next = [next_addr, next_ch]
node.append(next)
while True:
addr += now.size
now = dis64(addr)
if now.mnemonic == 'cmp' or addr == ret_addr:
break
return node


def show(addr, node):
print("0x%04x ->"%addr, end=' ')
for i in node:
print('\''+i[1]+'\'', "0x%04x"%i[0], end=', ')
print()



def get_graph(addr, switch):
if addr == 0x30d0:
return
global nodes
if switch == 0:
node = exp32(addr)
else:
node = exp64(addr)
switch ^= 1
nodes[addr] = node
show(addr, node)

for i in node:
if i[0] not in nodes.keys():
get_graph(i[0], switch)

get_graph(0, 0)

然后我们拿到了所有100个节点,以及每个节点的“指针”

接下来就是图搜索算法了,需要找到一条从0到0x30d0的路径。观察一下图里存在环,所以肯定是要找最短路径。

1
2
3
4
5
6
7
8
9
10
11
def get_path(addr, flag, path):
if addr == 0x30d0:
print("found!")
print("npointer{"+flag+"}")
return
node = nodes[addr]
for i in node:
if i[0] not in path:
get_path(i[0], flag+i[1], path+[i[0]])

get_path(0, "", [])

一度被图搜索代码困扰了挺久,后面想通了其实很简单。。。