Indent and add more comments for FreeBSD syscall code

After working on an issue that turned out to be with the
FreeBSD kernel scheduler, I played a lot with the Valgrind
syscall and scheduler code. I've kept the comments and the
reformatting.
This commit is contained in:
Paul Floyd 2022-05-30 22:57:34 +02:00
parent 807ea10a5c
commit 2b370744e5
6 changed files with 232 additions and 207 deletions

View File

@ -735,6 +735,10 @@ asm(
);
#elif defined(VGP_amd64_freebsd)
/* Convert function calling convention --> SYSCALL_STD calling
convention
PJF - not sure why we don't use SYSCALL0 convention like x86
*/
extern UWord do_syscall_WRK (
UWord syscall_no, /* %rdi */
UWord a1, /* %rsi */
@ -751,8 +755,6 @@ extern UWord do_syscall_WRK (
asm(
".text\n"
"do_syscall_WRK:\n"
/* Convert function calling convention --> syscall calling
convention */
" pushq %rbp\n"
" movq %rsp, %rbp\n"
" movq %rdi, %rax\n" /* syscall_no */
@ -761,21 +763,21 @@ asm(
" movq %rcx, %rdx\n" /* a3 */
" movq %r8, %r10\n" /* a4 */
" movq %r9, %r8\n" /* a5 */
" movq 16(%rbp), %r9\n" /* a6 last arg from stack, account for %rbp */
" movq 16(%rbp), %r9\n" /* a6 last arg from stack, account for %rbp */
" movq 24(%rbp), %r11\n" /* a7 from stack */
" pushq %r11\n"
" movq 32(%rbp), %r11\n" /* a8 from stack */
" pushq %r11\n"
" subq $8,%rsp\n" /* fake return addr */
" subq $8,%rsp\n" /* fake return addr */
" syscall\n"
" jb 1f\n"
" movq 48(%rbp),%rsi\n"
" movq %rdx, (%rsi)\n"
" movq 48(%rbp),%rsi\n" /* success */
" movq %rdx, (%rsi)\n" /* second return value */
" movq %rbp, %rsp\n"
" popq %rbp\n"
" ret\n"
"1:\n"
" movq 40(%rbp), %rsi\n"
"1:\n" /* error path */
" movq 40(%rbp), %rsi\n" /* flags */
" movl $1,(%rsi)\n"
" movq %rbp, %rsp\n"
" popq %rbp\n"

View File

@ -77,115 +77,124 @@
.globl ML_(do_syscall_for_client_WRK)
ML_(do_syscall_for_client_WRK):
/* save callee-saved regs */
pushq %rbp
movq %rsp, %rbp
pushq %rdi // -8(%rbp) syscallno
pushq %rsi // -16(%rbp) guest_state
pushq %rdx // -24(%rbp) sysmask
pushq %rcx // -32(%rbp) postmask
pushq %r8 // -40(%rbp) sigsetSzB
/* save callee-saved regs */
pushq %rbp
movq %rsp, %rbp
pushq %rdi // -8(%rbp) syscallno
pushq %rsi // -16(%rbp) guest_state
pushq %rdx // -24(%rbp) sysmask
pushq %rcx // -32(%rbp) postmask
pushq %r8 // -40(%rbp) sigsetSzB
1: /* Even though we can't take a signal until the sigprocmask completes,
start the range early.
If eip is in the range [1,2), the syscall hasn't been started yet */
1: /* Even though we can't take a signal until the sigprocmask completes,
start the range early.
If eip is in the range [1,2), the syscall hasn't been started yet */
/* Set the signal mask which should be current during the syscall. */
/* Save and restore all 5 arg regs round the call. This is easier
than figuring out the minimal set to save/restore. */
/* Set the signal mask which should be current during the syscall. */
/* Save and restore all 5 arg regs round the call. This is easier
than figuring out the minimal set to save/restore. */
movq $__NR_sigprocmask, %rax // syscall #
movq $VKI_SIG_SETMASK, %rdi // how
movq %rdx, %rsi // sysmask
movq %rcx, %rdx // postmask
syscall
movq $__NR_sigprocmask, %rax // syscall #
movq $VKI_SIG_SETMASK, %rdi // how
movq %rdx, %rsi // sysmask
movq %rcx, %rdx // postmask
syscall
jb 7f /* sigprocmask failed */
jb 7f /* sigprocmask failed */
/* OK, that worked. Now do the syscall proper. */
/* OK, that worked. Now do the syscall proper. */
/* 6 register parameters */
movq -16(%rbp), %r11 /* r11 = VexGuestAMD64State * */
movq OFFSET_amd64_RDI(%r11), %rdi
movq OFFSET_amd64_RSI(%r11), %rsi
movq OFFSET_amd64_RDX(%r11), %rdx
movq OFFSET_amd64_R10(%r11), %r10
movq OFFSET_amd64_R8(%r11), %r8
movq OFFSET_amd64_R9(%r11), %r9
/* 6 register parameters */
movq -16(%rbp), %r11 /* r11 = VexGuestAMD64State * */
movq OFFSET_amd64_RDI(%r11), %rdi
movq OFFSET_amd64_RSI(%r11), %rsi
movq OFFSET_amd64_RDX(%r11), %rdx
movq OFFSET_amd64_R10(%r11), %r10
movq OFFSET_amd64_R8(%r11), %r8
movq OFFSET_amd64_R9(%r11), %r9
/* 2 stack parameters plus return address (ignored by syscall) */
movq OFFSET_amd64_RSP(%r11), %r11 /* r11 = simulated RSP */
movq 16(%r11), %rax
pushq %rax
movq 8(%r11), %rax
pushq %rax
/* (fake) return address. */
movq 0(%r11), %rax
pushq %rax
/* @todo PJF there is a potential bug here
* syscall can take up to 8 arguments
* but when syscall0 or syscall198 is being used
* one argument is used for the syscall0/198 id
* and one for the actual id and in this case
* there could be 3 stack parameters.
* However, only mmap takes 8 arguments
* and only on x86. It would be an unlikely combination,
* but this might break one day. */
movq OFFSET_amd64_RSP(%r11), %r11 /* r11 = simulated RSP */
movq 16(%r11), %rax
pushq %rax
movq 8(%r11), %rax
pushq %rax
/* (fake) return address. */
movq 0(%r11), %rax
pushq %rax
/* syscallno */
movq -8(%rbp), %rax
movq -8(%rbp), %rax
/* If rip==2, then the syscall was either just about
to start, or was interrupted and the kernel was
restarting it. */
2: syscall
3: /* In the range [3, 4), the syscall result is in %rax,
but hasn't been committed to RAX. */
/* If rip==2, then the syscall was either just about
to start, or was interrupted and the kernel was
restarting it. */
2: syscall
3: /* In the range [3, 4), the syscall result is in %rax,
but hasn't been committed to RAX. */
/* stack contents: 3 words for syscall above, plus our prologue */
setc 0(%rsp) /* stash returned carry flag */
/* stack contents: 3 words for syscall above, plus our prologue */
setc 0(%rsp) /* stash returned carry flag */
movq -16(%rbp), %r11 /* r11 = VexGuestAMD64State * */
movq %rax, OFFSET_amd64_RAX(%r11) /* save back to RAX */
movq %rdx, OFFSET_amd64_RDX(%r11) /* save back to RDX */
movq -16(%rbp), %r11 /* r11 = VexGuestAMD64State * */
movq %rax, OFFSET_amd64_RAX(%r11) /* save back to RAX */
movq %rdx, OFFSET_amd64_RDX(%r11) /* save back to RDX */
/* save carry flag to VEX */
xorq %rax, %rax
movb 0(%rsp), %al
movq %rax, %rdi /* arg1 = new flag */
movq %r11, %rsi /* arg2 = vex state */
addq $24, %rsp /* remove syscall parameters */
movq $0x1, ML_(blksys_saving_cflag)
call LibVEX_GuestAMD64_put_rflag_c
movq $0x0, ML_(blksys_saving_cflag)
/* save carry flag to VEX */
xorq %rax, %rax
movb 0(%rsp), %al
movq %rax, %rdi /* arg1 = new flag */
movq %r11, %rsi /* arg2 = vex state */
addq $24, %rsp /* remove syscall parameters */
movq $0x1, ML_(blksys_saving_cflag)
call LibVEX_GuestAMD64_put_rflag_c
movq $0x0, ML_(blksys_saving_cflag)
4: /* Re-block signals. If eip is in [4,5), then the syscall
is complete and we needn't worry about it. */
4: /* Re-block signals. If eip is in [4,5), then the syscall
is complete and we needn't worry about it. */
movq $__NR_sigprocmask, %rax // syscall #
movq $VKI_SIG_SETMASK, %rdi // how
movq -32(%rbp), %rsi // postmask
xorq %rdx, %rdx // NULL
syscall
movq $__NR_sigprocmask, %rax // syscall #
movq $VKI_SIG_SETMASK, %rdi // how
movq -32(%rbp), %rsi // postmask
xorq %rdx, %rdx // NULL
syscall
jb 7f /* sigprocmask failed */
jb 7f /* sigprocmask failed */
5: /* now safe from signals */
xorq %rax,%rax
movq -8(%rbp), %rdi
movq -16(%rbp), %rsi
movq -24(%rbp), %rdx
movq -32(%rbp), %rcx
movq -40(%rbp), %r8
movq %rbp, %rsp
popq %rbp
ret
xorq %rax,%rax
movq -8(%rbp), %rdi
movq -16(%rbp), %rsi
movq -24(%rbp), %rdx
movq -32(%rbp), %rcx
movq -40(%rbp), %r8
movq %rbp, %rsp
popq %rbp
ret
7: /* failure: return 0x8000 | error code */
orq $0x8000, %rax
movq -8(%rbp), %rdi
movq -16(%rbp), %rsi
movq -24(%rbp), %rdx
movq -32(%rbp), %rcx
movq -40(%rbp), %r8
movq %rbp, %rsp
popq %rbp
ret
orq $0x8000, %rax
movq -8(%rbp), %rdi
movq -16(%rbp), %rsi
movq -24(%rbp), %rdx
movq -32(%rbp), %rcx
movq -40(%rbp), %r8
movq %rbp, %rsp
popq %rbp
ret
.section .rodata
/* export the ranges so that
VG_(fixup_guest_state_after_syscall_interrupted) can do the
right thing */
/* export the ranges so that
VG_(fixup_guest_state_after_syscall_interrupted) can do the
right thing */
.globl ML_(blksys_setup)
.globl ML_(blksys_restart)

View File

@ -38,40 +38,40 @@
/*----------------------------------------------------------------*/
/*
Perform a syscall for the client. This will run a syscall
with the client's specific per-thread signal mask.
Perform a syscall for the client. This will run a syscall
with the client's specific per-thread signal mask.
The structure of this function is such that, if the syscall is
interrupted by a signal, we can determine exactly what
execution state we were in with respect to the execution of
the syscall by examining the value of %eip in the signal
handler. This means that we can always do the appropriate
thing to precisely emulate the kernel's signal/syscall
interactions.
The structure of this function is such that, if the syscall is
interrupted by a signal, we can determine exactly what
execution state we were in with respect to the execution of
the syscall by examining the value of %eip in the signal
handler. This means that we can always do the appropriate
thing to precisely emulate the kernel's signal/syscall
interactions.
The syscall number is taken from the argument, even though it
should also be in regs->m_eax. The syscall result is written
back to regs->m_eax on completion.
The syscall number is taken from the argument, even though it
should also be in regs->m_eax. The syscall result is written
back to regs->m_eax on completion.
Returns 0 if the syscall was successfully called (even if the
syscall itself failed), or a -ve error code if one of the
sigprocmasks failed (there's no way to determine which one
failed).
Returns 0 if the syscall was successfully called (even if the
syscall itself failed), or a -ve error code if one of the
sigprocmasks failed (there's no way to determine which one
failed).
VG_(fixup_guest_state_after_syscall_interrupted) does the
thread state fixup in the case where we were interrupted by a
signal.
VG_(fixup_guest_state_after_syscall_interrupted) does the
thread state fixup in the case where we were interrupted by a
signal.
Prototype:
Int ML_(do_syscall_for_client_WRK)(
Int syscallno, // ebp+8
void* guest_state, // ebp+12
const vki_sigset_t *sysmask, // ebp+16
const vki_sigset_t *postmask, // ebp+20
Int sigsetSzB) // ebp+24
Int ML_(do_syscall_for_client_WRK)(
Int syscallno, // ebp+8
void* guest_state, // ebp+12
const vki_sigset_t *sysmask, // ebp+16
const vki_sigset_t *postmask, // ebp+20
Int sigsetSzB) // ebp+24
Note that sigsetSzB is totally ignored (and irrelevant).
Note that sigsetSzB is totally ignored (and irrelevant).
*/
/* from vki-darwin.h, checked at startup by m_vki.c */
@ -80,101 +80,101 @@
.globl ML_(do_syscall_for_client_WRK)
ML_(do_syscall_for_client_WRK):
/* establish stack frame */
push %ebp
mov %esp, %ebp
subl $8, %esp /* 16-byte align stack */
push %ebp
mov %esp, %ebp
subl $8, %esp /* 16-byte align stack */
1: /* Even though we can't take a signal until the
sigprocmask completes, start the range early.
If eip is in the range [1,2), the syscall hasn't been started yet */
sigprocmask completes, start the range early.
If eip is in the range [1,2), the syscall hasn't been started yet */
/* Set the signal mask which should be current during the syscall. */
pushl 20(%ebp)
pushl 16(%ebp)
pushl $VKI_SIG_SETMASK
pushl $0xcafebabe /* totally fake return address */
movl $__NR_sigprocmask, %eax
int $0x80
jc 7f /* sigprocmask failed */
addl $16,%esp
/* Set the signal mask which should be current during the syscall. */
pushl 20(%ebp)
pushl 16(%ebp)
pushl $VKI_SIG_SETMASK
pushl $0xcafebabe /* totally fake return address */
movl $__NR_sigprocmask, %eax
int $0x80
jc 7f /* sigprocmask failed */
addl $16,%esp
/* Copy syscall parameters to the stack - assume no more than 8
* plus the return address */
/* do_syscall8 */
/* stack is currently aligned assuming 8 parameters */
movl 12(%ebp), %edx
movl OFFSET_x86_ESP(%edx), %edx /* edx = simulated ESP */
movl 28+4(%edx), %eax
pushl %eax
movl 24+4(%edx), %eax
pushl %eax
movl 20+4(%edx), %eax
pushl %eax
movl 16+4(%edx), %eax
pushl %eax
movl 12+4(%edx), %eax
pushl %eax
movl 8+4(%edx), %eax
pushl %eax
movl 4+4(%edx), %eax
pushl %eax
movl 0+4(%edx), %eax
pushl %eax
/* return address */
movl 0(%edx), %eax
pushl %eax
/* Copy syscall parameters to the stack - assume no more than 8
* plus the return address */
/* do_syscall8 */
/* stack is currently aligned assuming 8 parameters */
movl 12(%ebp), %edx
movl OFFSET_x86_ESP(%edx), %edx /* edx = simulated ESP */
movl 28+4(%edx), %eax
pushl %eax
movl 24+4(%edx), %eax
pushl %eax
movl 20+4(%edx), %eax
pushl %eax
movl 16+4(%edx), %eax
pushl %eax
movl 12+4(%edx), %eax
pushl %eax
movl 8+4(%edx), %eax
pushl %eax
movl 4+4(%edx), %eax
pushl %eax
movl 0+4(%edx), %eax
pushl %eax
/* return address */
movl 0(%edx), %eax
pushl %eax
/* Put syscall number in eax */
movl 8(%ebp), %eax
/* Put syscall number in eax */
movl 8(%ebp), %eax
/* If eip==2, then the syscall was either just about to start,
or was interrupted and the kernel was restarting it. */
2: int $0x80 /* UNIX (GrP fixme should be sysenter?) */
/* If eip==2, then the syscall was either just about to start,
or was interrupted and the kernel was restarting it. */
2: int $0x80 /* UNIX (GrP fixme should be sysenter?) */
3: /* In the range [3, 4), the syscall result is in %eax and %edx and C,
but hasn't been committed to the thread state. */
setc 0(%esp) /* stash returned carry flag */
movl 12(%ebp), %ecx
movl %eax, OFFSET_x86_EAX(%ecx) /* save EAX to vex */
movl %edx, OFFSET_x86_EDX(%ecx) /* save EDX to vex */
/* save carry flag to vex */
subl $12, %esp
movl %ecx, 4(%esp)
movl $0, 0(%esp)
movb 12(%esp), %al
movb %al, 0(%esp)
3: /* In the range [3, 4), the syscall result is in %eax and %edx and C,
but hasn't been committed to the thread state. */
setc 0(%esp) /* stash returned carry flag */
movl 12(%ebp), %ecx
movl %eax, OFFSET_x86_EAX(%ecx) /* save EAX to vex */
movl %edx, OFFSET_x86_EDX(%ecx) /* save EDX to vex */
/* save carry flag to vex */
subl $12, %esp
movl %ecx, 4(%esp)
movl $0, 0(%esp)
movb 12(%esp), %al
movb %al, 0(%esp)
movl $0x1, ML_(blksys_saving_cflag)
call LibVEX_GuestX86_put_eflag_c
call LibVEX_GuestX86_put_eflag_c
movl $0x0, ML_(blksys_saving_cflag)
addl $12, %esp
addl $12, %esp
4: /* Re-block signals. If eip is in [4,5), then the syscall is
complete and we needn't worry about it. */
/* Set up for __pthread_sigmask(SIG_SETMASK, postmask, NULL) */
pushl $0
pushl 20(%ebp)
pushl $VKI_SIG_SETMASK
pushl $0xcafef00d /* totally fake return address */
movl $__NR_sigprocmask, %eax
int $0x80 /* should be sysenter? */
jc 7f /* sigprocmask failed */
addl $16,%esp
4: /* Re-block signals. If eip is in [4,5), then the syscall is
complete and we needn't worry about it. */
/* Set up for __pthread_sigmask(SIG_SETMASK, postmask, NULL) */
pushl $0
pushl 20(%ebp)
pushl $VKI_SIG_SETMASK
pushl $0xcafef00d /* totally fake return address */
movl $__NR_sigprocmask, %eax
int $0x80 /* should be sysenter? */
jc 7f /* sigprocmask failed */
addl $16,%esp
5: /* now safe from signals */
movl $0, %eax /* SUCCESS */
movl %ebp, %esp
popl %ebp
ret
5: /* now safe from signals */
movl $0, %eax /* SUCCESS */
movl %ebp, %esp
popl %ebp
ret
7: /* failure: return 0x8000 | error code */
/* Note that we enter here with %esp being 16 too low
(4 extra words on the stack). But because we're nuking
the stack frame now, that doesn't matter. */
andl $0x7FFF, %eax
orl $0x8000, %eax
movl %ebp, %esp
popl %ebp
ret
7: /* failure: return 0x8000 | error code */
/* Note that we enter here with %esp being 16 too low
(4 extra words on the stack). But because we're nuking
the stack frame now, that doesn't matter. */
andl $0x7FFF, %eax
orl $0x8000, %eax
movl %ebp, %esp
popl %ebp
ret
.section .rodata
/* export the ranges so that

View File

@ -1763,7 +1763,7 @@ PRE(sys_seteuid)
// int stat(char *path, struct freebsd11_stat *sb);
PRE(sys_freebsd11_stat)
{
PRINT("sys_stat ( %#" FMT_REGWORD "x(%s), %#" FMT_REGWORD "x )",ARG1,(char *)ARG1,ARG2);
PRINT("sys_freebsd11_stat ( %#" FMT_REGWORD "x(%s), %#" FMT_REGWORD "x )",ARG1,(char *)ARG1,ARG2);
PRE_REG_READ2(int, "stat", char *, path, struct freebsd11_stat *, sb);
PRE_MEM_RASCIIZ( "stat(path)", ARG1 );
PRE_MEM_WRITE( "stat(sb)", ARG2, sizeof(struct vki_freebsd11_stat) );
@ -1792,7 +1792,7 @@ POST(sys_freebsd11_fstat)
// int lstat(const char * restrict path, struct stat * restrict sb);
PRE(sys_freebsd11_lstat)
{
PRINT("sys_lstat ( %#" FMT_REGWORD "x(%s), %#" FMT_REGWORD "x )",ARG1,(char *)ARG1,ARG2);
PRINT("sys_freebsd11_lstat ( %#" FMT_REGWORD "x(%s), %#" FMT_REGWORD "x )",ARG1,(char *)ARG1,ARG2);
PRE_REG_READ2(sb, "lstat", const char *, path, struct freebsd11_stat *, sb);
PRE_MEM_RASCIIZ( "lstat(path)", ARG1 );
PRE_MEM_WRITE( "lstat(sb)", ARG2, sizeof(struct vki_freebsd11_stat) );

View File

@ -585,6 +585,11 @@ void getSyscallArgsFromGuestState ( /*OUT*/SyscallArgs* canonical,
UWord *stack = (UWord *)gst->guest_RSP;
// FreeBSD supports different calling conventions
// @todo PJF this all seems over complicated to me
// SYSCALL_STD is OK but for the other
// two here we overwrite canonical->sysno with
// the final syscall number but then in do_syscall_for_client
// we switch real_syscallno back to __NR_syscall or __NR___syscall
switch (gst->guest_RAX) {
case __NR_syscall:
canonical->klass = VG_FREEBSD_SYSCALL0;
@ -595,7 +600,7 @@ void getSyscallArgsFromGuestState ( /*OUT*/SyscallArgs* canonical,
canonical->sysno = gst->guest_RDI;
break;
default:
canonical->klass = 0;
canonical->klass = VG_FREEBSD_SYSCALL_STD;
canonical->sysno = gst->guest_RAX;
break;
}

View File

@ -29,8 +29,17 @@
#include "config.h"
// this is the syscall format used by e.g., libc functions like 'write'
// this is the one used 99.999% of the time
// the two others are only for experimental or testing use
// (but we use them in the scalar tests).
#define VG_FREEBSD_SYSCALL_STD 0
// this is the syscall format used by 'syscall'
#define VG_FREEBSD_SYSCALL0 1
// this is the syscall format used by '__syscall'
// it is the same as VG_FREEBSD_SYSCALL0 except that
// it ensures that 64bit argument alignment is correct
// that makes no difference for amd64, x86 not sure
#define VG_FREEBSD_SYSCALL198 2
// From sys/syscall.h