1// Copyright 2009 The Go Authors. All rights reserved.
   2// Use of this source code is governed by a BSD-style
   3// license that can be found in the LICENSE file.
   4
   5// Linux system calls.
   6// This file is compiled as ordinary Go code,
   7// but it is also input to mksyscall,
   8// which parses the //sys lines and generates system call stubs.
   9// Note that sometimes we use a lowercase //sys name and
  10// wrap it in our own nicer implementation.
  11
  12package unix
  13
  14import (
  15	"encoding/binary"
  16	"slices"
  17	"strconv"
  18	"syscall"
  19	"time"
  20	"unsafe"
  21)
  22
  23/*
  24 * Wrapped
  25 */
  26
  27func Access(path string, mode uint32) (err error) {
  28	return Faccessat(AT_FDCWD, path, mode, 0)
  29}
  30
  31func Chmod(path string, mode uint32) (err error) {
  32	return Fchmodat(AT_FDCWD, path, mode, 0)
  33}
  34
  35func Chown(path string, uid int, gid int) (err error) {
  36	return Fchownat(AT_FDCWD, path, uid, gid, 0)
  37}
  38
  39func Creat(path string, mode uint32) (fd int, err error) {
  40	return Open(path, O_CREAT|O_WRONLY|O_TRUNC, mode)
  41}
  42
  43func EpollCreate(size int) (fd int, err error) {
  44	if size <= 0 {
  45		return -1, EINVAL
  46	}
  47	return EpollCreate1(0)
  48}
  49
  50//sys	FanotifyInit(flags uint, event_f_flags uint) (fd int, err error)
  51//sys	fanotifyMark(fd int, flags uint, mask uint64, dirFd int, pathname *byte) (err error)
  52
  53func FanotifyMark(fd int, flags uint, mask uint64, dirFd int, pathname string) (err error) {
  54	if pathname == "" {
  55		return fanotifyMark(fd, flags, mask, dirFd, nil)
  56	}
  57	p, err := BytePtrFromString(pathname)
  58	if err != nil {
  59		return err
  60	}
  61	return fanotifyMark(fd, flags, mask, dirFd, p)
  62}
  63
  64//sys	fchmodat(dirfd int, path string, mode uint32) (err error)
  65//sys	fchmodat2(dirfd int, path string, mode uint32, flags int) (err error)
  66
  67func Fchmodat(dirfd int, path string, mode uint32, flags int) error {
  68	// Linux fchmodat doesn't support the flags parameter, but fchmodat2 does.
  69	// Try fchmodat2 if flags are specified.
  70	if flags != 0 {
  71		err := fchmodat2(dirfd, path, mode, flags)
  72		if err == ENOSYS {
  73			// fchmodat2 isn't available. If the flags are known to be valid,
  74			// return EOPNOTSUPP to indicate that fchmodat doesn't support them.
  75			if flags&^(AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH) != 0 {
  76				return EINVAL
  77			} else if flags&(AT_SYMLINK_NOFOLLOW|AT_EMPTY_PATH) != 0 {
  78				return EOPNOTSUPP
  79			}
  80		}
  81		return err
  82	}
  83	return fchmodat(dirfd, path, mode)
  84}
  85
  86func InotifyInit() (fd int, err error) {
  87	return InotifyInit1(0)
  88}
  89
  90//sys	ioctl(fd int, req uint, arg uintptr) (err error) = SYS_IOCTL
  91//sys	ioctlPtr(fd int, req uint, arg unsafe.Pointer) (err error) = SYS_IOCTL
  92
  93// ioctl itself should not be exposed directly, but additional get/set functions
  94// for specific types are permissible. These are defined in ioctl.go and
  95// ioctl_linux.go.
  96//
  97// The third argument to ioctl is often a pointer but sometimes an integer.
  98// Callers should use ioctlPtr when the third argument is a pointer and ioctl
  99// when the third argument is an integer.
 100//
 101// TODO: some existing code incorrectly uses ioctl when it should use ioctlPtr.
 102
 103//sys	Linkat(olddirfd int, oldpath string, newdirfd int, newpath string, flags int) (err error)
 104
 105func Link(oldpath string, newpath string) (err error) {
 106	return Linkat(AT_FDCWD, oldpath, AT_FDCWD, newpath, 0)
 107}
 108
 109func Mkdir(path string, mode uint32) (err error) {
 110	return Mkdirat(AT_FDCWD, path, mode)
 111}
 112
 113func Mknod(path string, mode uint32, dev int) (err error) {
 114	return Mknodat(AT_FDCWD, path, mode, dev)
 115}
 116
 117func Open(path string, mode int, perm uint32) (fd int, err error) {
 118	return openat(AT_FDCWD, path, mode|O_LARGEFILE, perm)
 119}
 120
 121//sys	openat(dirfd int, path string, flags int, mode uint32) (fd int, err error)
 122
 123func Openat(dirfd int, path string, flags int, mode uint32) (fd int, err error) {
 124	return openat(dirfd, path, flags|O_LARGEFILE, mode)
 125}
 126
 127//sys	openat2(dirfd int, path string, open_how *OpenHow, size int) (fd int, err error)
 128
 129func Openat2(dirfd int, path string, how *OpenHow) (fd int, err error) {
 130	return openat2(dirfd, path, how, SizeofOpenHow)
 131}
 132
 133func Pipe(p []int) error {
 134	return Pipe2(p, 0)
 135}
 136
 137//sysnb	pipe2(p *[2]_C_int, flags int) (err error)
 138
 139func Pipe2(p []int, flags int) error {
 140	if len(p) != 2 {
 141		return EINVAL
 142	}
 143	var pp [2]_C_int
 144	err := pipe2(&pp, flags)
 145	if err == nil {
 146		p[0] = int(pp[0])
 147		p[1] = int(pp[1])
 148	}
 149	return err
 150}
 151
 152//sys	ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error)
 153
 154func Ppoll(fds []PollFd, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
 155	if len(fds) == 0 {
 156		return ppoll(nil, 0, timeout, sigmask)
 157	}
 158	return ppoll(&fds[0], len(fds), timeout, sigmask)
 159}
 160
 161func Poll(fds []PollFd, timeout int) (n int, err error) {
 162	var ts *Timespec
 163	if timeout >= 0 {
 164		ts = new(Timespec)
 165		*ts = NsecToTimespec(int64(timeout) * 1e6)
 166	}
 167	return Ppoll(fds, ts, nil)
 168}
 169
 170//sys	Readlinkat(dirfd int, path string, buf []byte) (n int, err error)
 171
 172func Readlink(path string, buf []byte) (n int, err error) {
 173	return Readlinkat(AT_FDCWD, path, buf)
 174}
 175
 176func Rename(oldpath string, newpath string) (err error) {
 177	return Renameat(AT_FDCWD, oldpath, AT_FDCWD, newpath)
 178}
 179
 180func Rmdir(path string) error {
 181	return Unlinkat(AT_FDCWD, path, AT_REMOVEDIR)
 182}
 183
 184//sys	Symlinkat(oldpath string, newdirfd int, newpath string) (err error)
 185
 186func Symlink(oldpath string, newpath string) (err error) {
 187	return Symlinkat(oldpath, AT_FDCWD, newpath)
 188}
 189
 190func Unlink(path string) error {
 191	return Unlinkat(AT_FDCWD, path, 0)
 192}
 193
 194//sys	Unlinkat(dirfd int, path string, flags int) (err error)
 195
 196func Utimes(path string, tv []Timeval) error {
 197	if tv == nil {
 198		err := utimensat(AT_FDCWD, path, nil, 0)
 199		if err != ENOSYS {
 200			return err
 201		}
 202		return utimes(path, nil)
 203	}
 204	if len(tv) != 2 {
 205		return EINVAL
 206	}
 207	var ts [2]Timespec
 208	ts[0] = NsecToTimespec(TimevalToNsec(tv[0]))
 209	ts[1] = NsecToTimespec(TimevalToNsec(tv[1]))
 210	err := utimensat(AT_FDCWD, path, (*[2]Timespec)(unsafe.Pointer(&ts[0])), 0)
 211	if err != ENOSYS {
 212		return err
 213	}
 214	return utimes(path, (*[2]Timeval)(unsafe.Pointer(&tv[0])))
 215}
 216
 217//sys	utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error)
 218
 219func UtimesNano(path string, ts []Timespec) error {
 220	return UtimesNanoAt(AT_FDCWD, path, ts, 0)
 221}
 222
 223func UtimesNanoAt(dirfd int, path string, ts []Timespec, flags int) error {
 224	if ts == nil {
 225		return utimensat(dirfd, path, nil, flags)
 226	}
 227	if len(ts) != 2 {
 228		return EINVAL
 229	}
 230	return utimensat(dirfd, path, (*[2]Timespec)(unsafe.Pointer(&ts[0])), flags)
 231}
 232
 233func Futimesat(dirfd int, path string, tv []Timeval) error {
 234	if tv == nil {
 235		return futimesat(dirfd, path, nil)
 236	}
 237	if len(tv) != 2 {
 238		return EINVAL
 239	}
 240	return futimesat(dirfd, path, (*[2]Timeval)(unsafe.Pointer(&tv[0])))
 241}
 242
 243func Futimes(fd int, tv []Timeval) (err error) {
 244	// Believe it or not, this is the best we can do on Linux
 245	// (and is what glibc does).
 246	return Utimes("/proc/self/fd/"+strconv.Itoa(fd), tv)
 247}
 248
// ImplementsGetwd is true in this file, which also defines Getwd.
// NOTE(review): presumably callers use it to decide whether Getwd is
// available on the platform; confirm against users of the constant.
const ImplementsGetwd = true
 250
 251//sys	Getcwd(buf []byte) (n int, err error)
 252
 253func Getwd() (wd string, err error) {
 254	var buf [PathMax]byte
 255	n, err := Getcwd(buf[0:])
 256	if err != nil {
 257		return "", err
 258	}
 259	// Getcwd returns the number of bytes written to buf, including the NUL.
 260	if n < 1 || n > len(buf) || buf[n-1] != 0 {
 261		return "", EINVAL
 262	}
 263	// In some cases, Linux can return a path that starts with the
 264	// "(unreachable)" prefix, which can potentially be a valid relative
 265	// path. To work around that, return ENOENT if path is not absolute.
 266	if buf[0] != '/' {
 267		return "", ENOENT
 268	}
 269
 270	return string(buf[0 : n-1]), nil
 271}
 272
 273func Getgroups() (gids []int, err error) {
 274	n, err := getgroups(0, nil)
 275	if err != nil {
 276		return nil, err
 277	}
 278	if n == 0 {
 279		return nil, nil
 280	}
 281
 282	// Sanity check group count. Max is 1<<16 on Linux.
 283	if n < 0 || n > 1<<20 {
 284		return nil, EINVAL
 285	}
 286
 287	a := make([]_Gid_t, n)
 288	n, err = getgroups(n, &a[0])
 289	if err != nil {
 290		return nil, err
 291	}
 292	gids = make([]int, n)
 293	for i, v := range a[0:n] {
 294		gids[i] = int(v)
 295	}
 296	return
 297}
 298
 299func Setgroups(gids []int) (err error) {
 300	if len(gids) == 0 {
 301		return setgroups(0, nil)
 302	}
 303
 304	a := make([]_Gid_t, len(gids))
 305	for i, v := range gids {
 306		a[i] = _Gid_t(v)
 307	}
 308	return setgroups(len(a), &a[0])
 309}
 310
 311type WaitStatus uint32
 312
 313// Wait status is 7 bits at bottom, either 0 (exited),
 314// 0x7F (stopped), or a signal number that caused an exit.
 315// The 0x80 bit is whether there was a core dump.
 316// An extra number (exit code, signal causing a stop)
 317// is in the high bits. At least that's the idea.
 318// There are various irregularities. For example, the
 319// "continued" status is 0xFFFF, distinguishing itself
 320// from stopped via the core dump bit.
 321
 322const (
 323	mask    = 0x7F
 324	core    = 0x80
 325	exited  = 0x00
 326	stopped = 0x7F
 327	shift   = 8
 328)
 329
 330func (w WaitStatus) Exited() bool { return w&mask == exited }
 331
 332func (w WaitStatus) Signaled() bool { return w&mask != stopped && w&mask != exited }
 333
 334func (w WaitStatus) Stopped() bool { return w&0xFF == stopped }
 335
 336func (w WaitStatus) Continued() bool { return w == 0xFFFF }
 337
 338func (w WaitStatus) CoreDump() bool { return w.Signaled() && w&core != 0 }
 339
 340func (w WaitStatus) ExitStatus() int {
 341	if !w.Exited() {
 342		return -1
 343	}
 344	return int(w>>shift) & 0xFF
 345}
 346
 347func (w WaitStatus) Signal() syscall.Signal {
 348	if !w.Signaled() {
 349		return -1
 350	}
 351	return syscall.Signal(w & mask)
 352}
 353
 354func (w WaitStatus) StopSignal() syscall.Signal {
 355	if !w.Stopped() {
 356		return -1
 357	}
 358	return syscall.Signal(w>>shift) & 0xFF
 359}
 360
 361func (w WaitStatus) TrapCause() int {
 362	if w.StopSignal() != SIGTRAP {
 363		return -1
 364	}
 365	return int(w>>shift) >> 8
 366}
 367
 368//sys	wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err error)
 369
 370func Wait4(pid int, wstatus *WaitStatus, options int, rusage *Rusage) (wpid int, err error) {
 371	var status _C_int
 372	wpid, err = wait4(pid, &status, options, rusage)
 373	if wstatus != nil {
 374		*wstatus = WaitStatus(status)
 375	}
 376	return
 377}
 378
 379//sys	Waitid(idType int, id int, info *Siginfo, options int, rusage *Rusage) (err error)
 380
 381func Mkfifo(path string, mode uint32) error {
 382	return Mknod(path, mode|S_IFIFO, 0)
 383}
 384
 385func Mkfifoat(dirfd int, path string, mode uint32) error {
 386	return Mknodat(dirfd, path, mode|S_IFIFO, 0)
 387}
 388
 389func (sa *SockaddrInet4) sockaddr() (unsafe.Pointer, _Socklen, error) {
 390	if sa.Port < 0 || sa.Port > 0xFFFF {
 391		return nil, 0, EINVAL
 392	}
 393	sa.raw.Family = AF_INET
 394	p := (*[2]byte)(unsafe.Pointer(&sa.raw.Port))
 395	p[0] = byte(sa.Port >> 8)
 396	p[1] = byte(sa.Port)
 397	sa.raw.Addr = sa.Addr
 398	return unsafe.Pointer(&sa.raw), SizeofSockaddrInet4, nil
 399}
 400
 401func (sa *SockaddrInet6) sockaddr() (unsafe.Pointer, _Socklen, error) {
 402	if sa.Port < 0 || sa.Port > 0xFFFF {
 403		return nil, 0, EINVAL
 404	}
 405	sa.raw.Family = AF_INET6
 406	p := (*[2]byte)(unsafe.Pointer(&sa.raw.Port))
 407	p[0] = byte(sa.Port >> 8)
 408	p[1] = byte(sa.Port)
 409	sa.raw.Scope_id = sa.ZoneId
 410	sa.raw.Addr = sa.Addr
 411	return unsafe.Pointer(&sa.raw), SizeofSockaddrInet6, nil
 412}
 413
 414func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, _Socklen, error) {
 415	name := sa.Name
 416	n := len(name)
 417	if n >= len(sa.raw.Path) {
 418		return nil, 0, EINVAL
 419	}
 420	sa.raw.Family = AF_UNIX
 421	for i := range n {
 422		sa.raw.Path[i] = int8(name[i])
 423	}
 424	// length is family (uint16), name, NUL.
 425	sl := _Socklen(2)
 426	if n > 0 {
 427		sl += _Socklen(n) + 1
 428	}
 429	if sa.raw.Path[0] == '@' || (sa.raw.Path[0] == 0 && sl > 3) {
 430		// Check sl > 3 so we don't change unnamed socket behavior.
 431		sa.raw.Path[0] = 0
 432		// Don't count trailing NUL for abstract address.
 433		sl--
 434	}
 435
 436	return unsafe.Pointer(&sa.raw), sl, nil
 437}
 438
 439// SockaddrLinklayer implements the Sockaddr interface for AF_PACKET type sockets.
 440type SockaddrLinklayer struct {
 441	Protocol uint16
 442	Ifindex  int
 443	Hatype   uint16
 444	Pkttype  uint8
 445	Halen    uint8
 446	Addr     [8]byte
 447	raw      RawSockaddrLinklayer
 448}
 449
 450func (sa *SockaddrLinklayer) sockaddr() (unsafe.Pointer, _Socklen, error) {
 451	if sa.Ifindex < 0 || sa.Ifindex > 0x7fffffff {
 452		return nil, 0, EINVAL
 453	}
 454	sa.raw.Family = AF_PACKET
 455	sa.raw.Protocol = sa.Protocol
 456	sa.raw.Ifindex = int32(sa.Ifindex)
 457	sa.raw.Hatype = sa.Hatype
 458	sa.raw.Pkttype = sa.Pkttype
 459	sa.raw.Halen = sa.Halen
 460	sa.raw.Addr = sa.Addr
 461	return unsafe.Pointer(&sa.raw), SizeofSockaddrLinklayer, nil
 462}
 463
 464// SockaddrNetlink implements the Sockaddr interface for AF_NETLINK type sockets.
 465type SockaddrNetlink struct {
 466	Family uint16
 467	Pad    uint16
 468	Pid    uint32
 469	Groups uint32
 470	raw    RawSockaddrNetlink
 471}
 472
 473func (sa *SockaddrNetlink) sockaddr() (unsafe.Pointer, _Socklen, error) {
 474	sa.raw.Family = AF_NETLINK
 475	sa.raw.Pad = sa.Pad
 476	sa.raw.Pid = sa.Pid
 477	sa.raw.Groups = sa.Groups
 478	return unsafe.Pointer(&sa.raw), SizeofSockaddrNetlink, nil
 479}
 480
 481// SockaddrHCI implements the Sockaddr interface for AF_BLUETOOTH type sockets
 482// using the HCI protocol.
 483type SockaddrHCI struct {
 484	Dev     uint16
 485	Channel uint16
 486	raw     RawSockaddrHCI
 487}
 488
 489func (sa *SockaddrHCI) sockaddr() (unsafe.Pointer, _Socklen, error) {
 490	sa.raw.Family = AF_BLUETOOTH
 491	sa.raw.Dev = sa.Dev
 492	sa.raw.Channel = sa.Channel
 493	return unsafe.Pointer(&sa.raw), SizeofSockaddrHCI, nil
 494}
 495
 496// SockaddrL2 implements the Sockaddr interface for AF_BLUETOOTH type sockets
 497// using the L2CAP protocol.
 498type SockaddrL2 struct {
 499	PSM      uint16
 500	CID      uint16
 501	Addr     [6]uint8
 502	AddrType uint8
 503	raw      RawSockaddrL2
 504}
 505
 506func (sa *SockaddrL2) sockaddr() (unsafe.Pointer, _Socklen, error) {
 507	sa.raw.Family = AF_BLUETOOTH
 508	psm := (*[2]byte)(unsafe.Pointer(&sa.raw.Psm))
 509	psm[0] = byte(sa.PSM)
 510	psm[1] = byte(sa.PSM >> 8)
 511	for i := range len(sa.Addr) {
 512		sa.raw.Bdaddr[i] = sa.Addr[len(sa.Addr)-1-i]
 513	}
 514	cid := (*[2]byte)(unsafe.Pointer(&sa.raw.Cid))
 515	cid[0] = byte(sa.CID)
 516	cid[1] = byte(sa.CID >> 8)
 517	sa.raw.Bdaddr_type = sa.AddrType
 518	return unsafe.Pointer(&sa.raw), SizeofSockaddrL2, nil
 519}
 520
 521// SockaddrRFCOMM implements the Sockaddr interface for AF_BLUETOOTH type sockets
 522// using the RFCOMM protocol.
 523//
 524// Server example:
 525//
 526//	fd, _ := Socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM)
 527//	_ = unix.Bind(fd, &unix.SockaddrRFCOMM{
 528//		Channel: 1,
 529//		Addr:    [6]uint8{0, 0, 0, 0, 0, 0}, // BDADDR_ANY or 00:00:00:00:00:00
 530//	})
 531//	_ = Listen(fd, 1)
 532//	nfd, sa, _ := Accept(fd)
 533//	fmt.Printf("conn addr=%v fd=%d", sa.(*unix.SockaddrRFCOMM).Addr, nfd)
 534//	Read(nfd, buf)
 535//
 536// Client example:
 537//
 538//	fd, _ := Socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM)
 539//	_ = Connect(fd, &SockaddrRFCOMM{
 540//		Channel: 1,
 541//		Addr:    [6]byte{0x11, 0x22, 0x33, 0xaa, 0xbb, 0xcc}, // CC:BB:AA:33:22:11
 542//	})
 543//	Write(fd, []byte(`hello`))
 544type SockaddrRFCOMM struct {
 545	// Addr represents a bluetooth address, byte ordering is little-endian.
 546	Addr [6]uint8
 547
 548	// Channel is a designated bluetooth channel, only 1-30 are available for use.
 549	// Since Linux 2.6.7 and further zero value is the first available channel.
 550	Channel uint8
 551
 552	raw RawSockaddrRFCOMM
 553}
 554
 555func (sa *SockaddrRFCOMM) sockaddr() (unsafe.Pointer, _Socklen, error) {
 556	sa.raw.Family = AF_BLUETOOTH
 557	sa.raw.Channel = sa.Channel
 558	sa.raw.Bdaddr = sa.Addr
 559	return unsafe.Pointer(&sa.raw), SizeofSockaddrRFCOMM, nil
 560}
 561
 562// SockaddrCAN implements the Sockaddr interface for AF_CAN type sockets.
 563// The RxID and TxID fields are used for transport protocol addressing in
 564// (CAN_TP16, CAN_TP20, CAN_MCNET, and CAN_ISOTP), they can be left with
 565// zero values for CAN_RAW and CAN_BCM sockets as they have no meaning.
 566//
 567// The SockaddrCAN struct must be bound to the socket file descriptor
 568// using Bind before the CAN socket can be used.
 569//
 570//	// Read one raw CAN frame
 571//	fd, _ := Socket(AF_CAN, SOCK_RAW, CAN_RAW)
 572//	addr := &SockaddrCAN{Ifindex: index}
 573//	Bind(fd, addr)
 574//	frame := make([]byte, 16)
 575//	Read(fd, frame)
 576//
 577// The full SocketCAN documentation can be found in the linux kernel
 578// archives at: https://www.kernel.org/doc/Documentation/networking/can.txt
 579type SockaddrCAN struct {
 580	Ifindex int
 581	RxID    uint32
 582	TxID    uint32
 583	raw     RawSockaddrCAN
 584}
 585
 586func (sa *SockaddrCAN) sockaddr() (unsafe.Pointer, _Socklen, error) {
 587	if sa.Ifindex < 0 || sa.Ifindex > 0x7fffffff {
 588		return nil, 0, EINVAL
 589	}
 590	sa.raw.Family = AF_CAN
 591	sa.raw.Ifindex = int32(sa.Ifindex)
 592	rx := (*[4]byte)(unsafe.Pointer(&sa.RxID))
 593	for i := range 4 {
 594		sa.raw.Addr[i] = rx[i]
 595	}
 596	tx := (*[4]byte)(unsafe.Pointer(&sa.TxID))
 597	for i := range 4 {
 598		sa.raw.Addr[i+4] = tx[i]
 599	}
 600	return unsafe.Pointer(&sa.raw), SizeofSockaddrCAN, nil
 601}
 602
 603// SockaddrCANJ1939 implements the Sockaddr interface for AF_CAN using J1939
 604// protocol (https://en.wikipedia.org/wiki/SAE_J1939). For more information
 605// on the purposes of the fields, check the official linux kernel documentation
 606// available here: https://www.kernel.org/doc/Documentation/networking/j1939.rst
 607type SockaddrCANJ1939 struct {
 608	Ifindex int
 609	Name    uint64
 610	PGN     uint32
 611	Addr    uint8
 612	raw     RawSockaddrCAN
 613}
 614
 615func (sa *SockaddrCANJ1939) sockaddr() (unsafe.Pointer, _Socklen, error) {
 616	if sa.Ifindex < 0 || sa.Ifindex > 0x7fffffff {
 617		return nil, 0, EINVAL
 618	}
 619	sa.raw.Family = AF_CAN
 620	sa.raw.Ifindex = int32(sa.Ifindex)
 621	n := (*[8]byte)(unsafe.Pointer(&sa.Name))
 622	for i := range 8 {
 623		sa.raw.Addr[i] = n[i]
 624	}
 625	p := (*[4]byte)(unsafe.Pointer(&sa.PGN))
 626	for i := range 4 {
 627		sa.raw.Addr[i+8] = p[i]
 628	}
 629	sa.raw.Addr[12] = sa.Addr
 630	return unsafe.Pointer(&sa.raw), SizeofSockaddrCAN, nil
 631}
 632
 633// SockaddrALG implements the Sockaddr interface for AF_ALG type sockets.
 634// SockaddrALG enables userspace access to the Linux kernel's cryptography
 635// subsystem. The Type and Name fields specify which type of hash or cipher
 636// should be used with a given socket.
 637//
 638// To create a file descriptor that provides access to a hash or cipher, both
 639// Bind and Accept must be used. Once the setup process is complete, input
 640// data can be written to the socket, processed by the kernel, and then read
 641// back as hash output or ciphertext.
 642//
 643// Here is an example of using an AF_ALG socket with SHA1 hashing.
 644// The initial socket setup process is as follows:
 645//
 646//	// Open a socket to perform SHA1 hashing.
 647//	fd, _ := unix.Socket(unix.AF_ALG, unix.SOCK_SEQPACKET, 0)
 648//	addr := &unix.SockaddrALG{Type: "hash", Name: "sha1"}
 649//	unix.Bind(fd, addr)
 650//	// Note: unix.Accept does not work at this time; must invoke accept()
 651//	// manually using unix.Syscall.
 652//	hashfd, _, _ := unix.Syscall(unix.SYS_ACCEPT, uintptr(fd), 0, 0)
 653//
 654// Once a file descriptor has been returned from Accept, it may be used to
 655// perform SHA1 hashing. The descriptor is not safe for concurrent use, but
 656// may be re-used repeatedly with subsequent Write and Read operations.
 657//
 658// When hashing a small byte slice or string, a single Write and Read may
 659// be used:
 660//
 661//	// Assume hashfd is already configured using the setup process.
 662//	hash := os.NewFile(hashfd, "sha1")
 663//	// Hash an input string and read the results. Each Write discards
 664//	// previous hash state. Read always reads the current state.
 665//	b := make([]byte, 20)
 666//	for i := 0; i < 2; i++ {
 667//	    io.WriteString(hash, "Hello, world.")
 668//	    hash.Read(b)
 669//	    fmt.Println(hex.EncodeToString(b))
 670//	}
 671//	// Output:
 672//	// 2ae01472317d1935a84797ec1983ae243fc6aa28
 673//	// 2ae01472317d1935a84797ec1983ae243fc6aa28
 674//
 675// For hashing larger byte slices, or byte streams such as those read from
 676// a file or socket, use Sendto with MSG_MORE to instruct the kernel to update
 677// the hash digest instead of creating a new one for a given chunk and finalizing it.
 678//
 679//	// Assume hashfd and addr are already configured using the setup process.
 680//	hash := os.NewFile(hashfd, "sha1")
 681//	// Hash the contents of a file.
 682//	f, _ := os.Open("/tmp/linux-4.10-rc7.tar.xz")
 683//	b := make([]byte, 4096)
 684//	for {
 685//	    n, err := f.Read(b)
 686//	    if err == io.EOF {
 687//	        break
 688//	    }
 689//	    unix.Sendto(hashfd, b[:n], unix.MSG_MORE, addr)
 690//	}
 691//	hash.Read(b)
 692//	fmt.Println(hex.EncodeToString(b))
 693//	// Output: 85cdcad0c06eef66f805ecce353bec9accbeecc5
 694//
 695// For more information, see: http://www.chronox.de/crypto-API/crypto/userspace-if.html.
type SockaddrALG struct {
	// Type and Name are copied into fixed-size buffers of the raw address
	// and must each leave room for a terminating NUL byte; sockaddr
	// returns EINVAL otherwise. Feature and Mask are copied into the raw
	// Feat and Mask fields. raw is populated by sockaddr.
	Type    string
	Name    string
	Feature uint32
	Mask    uint32
	raw     RawSockaddrALG
}
 703
 704func (sa *SockaddrALG) sockaddr() (unsafe.Pointer, _Socklen, error) {
 705	// Leave room for NUL byte terminator.
 706	if len(sa.Type) > len(sa.raw.Type)-1 {
 707		return nil, 0, EINVAL
 708	}
 709	if len(sa.Name) > len(sa.raw.Name)-1 {
 710		return nil, 0, EINVAL
 711	}
 712
 713	sa.raw.Family = AF_ALG
 714	sa.raw.Feat = sa.Feature
 715	sa.raw.Mask = sa.Mask
 716
 717	copy(sa.raw.Type[:], sa.Type)
 718	copy(sa.raw.Name[:], sa.Name)
 719
 720	return unsafe.Pointer(&sa.raw), SizeofSockaddrALG, nil
 721}
 722
 723// SockaddrVM implements the Sockaddr interface for AF_VSOCK type sockets.
 724// SockaddrVM provides access to Linux VM sockets: a mechanism that enables
 725// bidirectional communication between a hypervisor and its guest virtual
 726// machines.
 727type SockaddrVM struct {
 728	// CID and Port specify a context ID and port address for a VM socket.
 729	// Guests have a unique CID, and hosts may have a well-known CID of:
 730	//  - VMADDR_CID_HYPERVISOR: refers to the hypervisor process.
 731	//  - VMADDR_CID_LOCAL: refers to local communication (loopback).
 732	//  - VMADDR_CID_HOST: refers to other processes on the host.
 733	CID   uint32
 734	Port  uint32
 735	Flags uint8
 736	raw   RawSockaddrVM
 737}
 738
 739func (sa *SockaddrVM) sockaddr() (unsafe.Pointer, _Socklen, error) {
 740	sa.raw.Family = AF_VSOCK
 741	sa.raw.Port = sa.Port
 742	sa.raw.Cid = sa.CID
 743	sa.raw.Flags = sa.Flags
 744
 745	return unsafe.Pointer(&sa.raw), SizeofSockaddrVM, nil
 746}
 747
 748type SockaddrXDP struct {
 749	Flags        uint16
 750	Ifindex      uint32
 751	QueueID      uint32
 752	SharedUmemFD uint32
 753	raw          RawSockaddrXDP
 754}
 755
 756func (sa *SockaddrXDP) sockaddr() (unsafe.Pointer, _Socklen, error) {
 757	sa.raw.Family = AF_XDP
 758	sa.raw.Flags = sa.Flags
 759	sa.raw.Ifindex = sa.Ifindex
 760	sa.raw.Queue_id = sa.QueueID
 761	sa.raw.Shared_umem_fd = sa.SharedUmemFD
 762
 763	return unsafe.Pointer(&sa.raw), SizeofSockaddrXDP, nil
 764}
 765
 766// This constant mirrors the #define of PX_PROTO_OE in
 767// linux/if_pppox.h. We're defining this by hand here instead of
 768// autogenerating through mkerrors.sh because including
 769// linux/if_pppox.h causes some declaration conflicts with other
 770// includes (linux/if_pppox.h includes linux/in.h, which conflicts
 771// with netinet/in.h). Given that we only need a single zero constant
 772// out of that file, it's cleaner to just define it by hand here.
 773const px_proto_oe = 0
 774
 775type SockaddrPPPoE struct {
 776	SID    uint16
 777	Remote []byte
 778	Dev    string
 779	raw    RawSockaddrPPPoX
 780}
 781
 782func (sa *SockaddrPPPoE) sockaddr() (unsafe.Pointer, _Socklen, error) {
 783	if len(sa.Remote) != 6 {
 784		return nil, 0, EINVAL
 785	}
 786	if len(sa.Dev) > IFNAMSIZ-1 {
 787		return nil, 0, EINVAL
 788	}
 789
 790	*(*uint16)(unsafe.Pointer(&sa.raw[0])) = AF_PPPOX
 791	// This next field is in host-endian byte order. We can't use the
 792	// same unsafe pointer cast as above, because this value is not
 793	// 32-bit aligned and some architectures don't allow unaligned
 794	// access.
 795	//
 796	// However, the value of px_proto_oe is 0, so we can use
 797	// encoding/binary helpers to write the bytes without worrying
 798	// about the ordering.
 799	binary.BigEndian.PutUint32(sa.raw[2:6], px_proto_oe)
 800	// This field is deliberately big-endian, unlike the previous
 801	// one. The kernel expects SID to be in network byte order.
 802	binary.BigEndian.PutUint16(sa.raw[6:8], sa.SID)
 803	copy(sa.raw[8:14], sa.Remote)
 804	clear(sa.raw[14 : 14+IFNAMSIZ])
 805	copy(sa.raw[14:], sa.Dev)
 806	return unsafe.Pointer(&sa.raw), SizeofSockaddrPPPoX, nil
 807}
 808
// SockaddrTIPC implements the Sockaddr interface for AF_TIPC type sockets.
// For more information on TIPC, see: http://tipc.sourceforge.net/.
type SockaddrTIPC struct {
	// Scope is the publication scope used when binding a service or a
	// service range. Should be set to TIPC_CLUSTER_SCOPE or
	// TIPC_NODE_SCOPE. The value is converted to int8 when the raw
	// address is built.
	Scope int

	// Addr is the type of address used to manipulate a socket. Addr must be
	// one of:
	//  - *TIPCSocketAddr: "id" variant in the C addr union
	//  - *TIPCServiceRange: "nameseq" variant in the C addr union
	//  - *TIPCServiceName: "name" variant in the C addr union
	//
	// If nil, EINVAL will be returned when the structure is used.
	Addr TIPCAddr

	// raw is the kernel representation, populated by sockaddr.
	raw RawSockaddrTIPC
}
 827
// TIPCAddr is implemented by types that can be used as an address for
// SockaddrTIPC. It is only implemented by *TIPCSocketAddr, *TIPCServiceRange,
// and *TIPCServiceName.
type TIPCAddr interface {
	// tipcAddrtype returns the value stored in the raw address's Addrtype
	// field for this address variant.
	tipcAddrtype() uint8
	// tipcAddr returns the 12-byte payload stored in the raw address's
	// Addr field.
	tipcAddr() [12]byte
}
 835
 836func (sa *TIPCSocketAddr) tipcAddr() [12]byte {
 837	var out [12]byte
 838	copy(out[:], (*(*[unsafe.Sizeof(TIPCSocketAddr{})]byte)(unsafe.Pointer(sa)))[:])
 839	return out
 840}
 841
 842func (sa *TIPCSocketAddr) tipcAddrtype() uint8 { return TIPC_SOCKET_ADDR }
 843
 844func (sa *TIPCServiceRange) tipcAddr() [12]byte {
 845	var out [12]byte
 846	copy(out[:], (*(*[unsafe.Sizeof(TIPCServiceRange{})]byte)(unsafe.Pointer(sa)))[:])
 847	return out
 848}
 849
 850func (sa *TIPCServiceRange) tipcAddrtype() uint8 { return TIPC_SERVICE_RANGE }
 851
 852func (sa *TIPCServiceName) tipcAddr() [12]byte {
 853	var out [12]byte
 854	copy(out[:], (*(*[unsafe.Sizeof(TIPCServiceName{})]byte)(unsafe.Pointer(sa)))[:])
 855	return out
 856}
 857
 858func (sa *TIPCServiceName) tipcAddrtype() uint8 { return TIPC_SERVICE_ADDR }
 859
 860func (sa *SockaddrTIPC) sockaddr() (unsafe.Pointer, _Socklen, error) {
 861	if sa.Addr == nil {
 862		return nil, 0, EINVAL
 863	}
 864	sa.raw.Family = AF_TIPC
 865	sa.raw.Scope = int8(sa.Scope)
 866	sa.raw.Addrtype = sa.Addr.tipcAddrtype()
 867	sa.raw.Addr = sa.Addr.tipcAddr()
 868	return unsafe.Pointer(&sa.raw), SizeofSockaddrTIPC, nil
 869}
 870
 871// SockaddrL2TPIP implements the Sockaddr interface for IPPROTO_L2TP/AF_INET sockets.
 872type SockaddrL2TPIP struct {
 873	Addr   [4]byte
 874	ConnId uint32
 875	raw    RawSockaddrL2TPIP
 876}
 877
 878func (sa *SockaddrL2TPIP) sockaddr() (unsafe.Pointer, _Socklen, error) {
 879	sa.raw.Family = AF_INET
 880	sa.raw.Conn_id = sa.ConnId
 881	sa.raw.Addr = sa.Addr
 882	return unsafe.Pointer(&sa.raw), SizeofSockaddrL2TPIP, nil
 883}
 884
 885// SockaddrL2TPIP6 implements the Sockaddr interface for IPPROTO_L2TP/AF_INET6 sockets.
 886type SockaddrL2TPIP6 struct {
 887	Addr   [16]byte
 888	ZoneId uint32
 889	ConnId uint32
 890	raw    RawSockaddrL2TPIP6
 891}
 892
 893func (sa *SockaddrL2TPIP6) sockaddr() (unsafe.Pointer, _Socklen, error) {
 894	sa.raw.Family = AF_INET6
 895	sa.raw.Conn_id = sa.ConnId
 896	sa.raw.Scope_id = sa.ZoneId
 897	sa.raw.Addr = sa.Addr
 898	return unsafe.Pointer(&sa.raw), SizeofSockaddrL2TPIP6, nil
 899}
 900
 901// SockaddrIUCV implements the Sockaddr interface for AF_IUCV sockets.
 902type SockaddrIUCV struct {
 903	UserID string
 904	Name   string
 905	raw    RawSockaddrIUCV
 906}
 907
 908func (sa *SockaddrIUCV) sockaddr() (unsafe.Pointer, _Socklen, error) {
 909	sa.raw.Family = AF_IUCV
 910	// These are EBCDIC encoded by the kernel, but we still need to pad them
 911	// with blanks. Initializing with blanks allows the caller to feed in either
 912	// a padded or an unpadded string.
 913	for i := range 8 {
 914		sa.raw.Nodeid[i] = ' '
 915		sa.raw.User_id[i] = ' '
 916		sa.raw.Name[i] = ' '
 917	}
 918	if len(sa.UserID) > 8 || len(sa.Name) > 8 {
 919		return nil, 0, EINVAL
 920	}
 921	for i, b := range []byte(sa.UserID[:]) {
 922		sa.raw.User_id[i] = int8(b)
 923	}
 924	for i, b := range []byte(sa.Name[:]) {
 925		sa.raw.Name[i] = int8(b)
 926	}
 927	return unsafe.Pointer(&sa.raw), SizeofSockaddrIUCV, nil
 928}
 929
 930type SockaddrNFC struct {
 931	DeviceIdx   uint32
 932	TargetIdx   uint32
 933	NFCProtocol uint32
 934	raw         RawSockaddrNFC
 935}
 936
 937func (sa *SockaddrNFC) sockaddr() (unsafe.Pointer, _Socklen, error) {
 938	sa.raw.Sa_family = AF_NFC
 939	sa.raw.Dev_idx = sa.DeviceIdx
 940	sa.raw.Target_idx = sa.TargetIdx
 941	sa.raw.Nfc_protocol = sa.NFCProtocol
 942	return unsafe.Pointer(&sa.raw), SizeofSockaddrNFC, nil
 943}
 944
 945type SockaddrNFCLLCP struct {
 946	DeviceIdx      uint32
 947	TargetIdx      uint32
 948	NFCProtocol    uint32
 949	DestinationSAP uint8
 950	SourceSAP      uint8
 951	ServiceName    string
 952	raw            RawSockaddrNFCLLCP
 953}
 954
 955func (sa *SockaddrNFCLLCP) sockaddr() (unsafe.Pointer, _Socklen, error) {
 956	sa.raw.Sa_family = AF_NFC
 957	sa.raw.Dev_idx = sa.DeviceIdx
 958	sa.raw.Target_idx = sa.TargetIdx
 959	sa.raw.Nfc_protocol = sa.NFCProtocol
 960	sa.raw.Dsap = sa.DestinationSAP
 961	sa.raw.Ssap = sa.SourceSAP
 962	if len(sa.ServiceName) > len(sa.raw.Service_name) {
 963		return nil, 0, EINVAL
 964	}
 965	copy(sa.raw.Service_name[:], sa.ServiceName)
 966	sa.raw.SetServiceNameLen(len(sa.ServiceName))
 967	return unsafe.Pointer(&sa.raw), SizeofSockaddrNFCLLCP, nil
 968}
 969
 970var socketProtocol = func(fd int) (int, error) {
 971	return GetsockoptInt(fd, SOL_SOCKET, SO_PROTOCOL)
 972}
 973
// anyToSockaddr converts the raw socket address in rsa into the matching
// Sockaddr implementation, selected by rsa.Addr.Family.
//
// For families whose address layout depends on the socket protocol (AF_INET,
// AF_INET6, AF_BLUETOOTH, AF_CAN and AF_NFC) the protocol of fd is looked up
// through socketProtocol, and any error from that lookup is returned as is.
//
// It returns EAFNOSUPPORT when the family (or, for AF_BLUETOOTH, the protocol)
// is not handled, and EINVAL when the family is handled but the contents of
// rsa are not acceptable.
func anyToSockaddr(fd int, rsa *RawSockaddrAny) (Sockaddr, error) {
	switch rsa.Addr.Family {
	case AF_NETLINK:
		pp := (*RawSockaddrNetlink)(unsafe.Pointer(rsa))
		sa := new(SockaddrNetlink)
		sa.Family = pp.Family
		sa.Pad = pp.Pad
		sa.Pid = pp.Pid
		sa.Groups = pp.Groups
		return sa, nil

	case AF_PACKET:
		pp := (*RawSockaddrLinklayer)(unsafe.Pointer(rsa))
		sa := new(SockaddrLinklayer)
		sa.Protocol = pp.Protocol
		sa.Ifindex = int(pp.Ifindex)
		sa.Hatype = pp.Hatype
		sa.Pkttype = pp.Pkttype
		sa.Halen = pp.Halen
		sa.Addr = pp.Addr
		return sa, nil

	case AF_UNIX:
		pp := (*RawSockaddrUnix)(unsafe.Pointer(rsa))
		sa := new(SockaddrUnix)
		if pp.Path[0] == 0 {
			// "Abstract" Unix domain socket.
			// Rewrite leading NUL as @ for textual display.
			// (This is the standard convention.)
			// Not friendly to overwrite in place,
			// but the callers below don't care.
			pp.Path[0] = '@'
		}

		// Assume path ends at NUL.
		// This is not technically the Linux semantics for
		// abstract Unix domain sockets--they are supposed
		// to be uninterpreted fixed-size binary blobs--but
		// everyone uses this convention.
		n := 0
		for n < len(pp.Path) && pp.Path[n] != 0 {
			n++
		}
		// The string conversion copies the first n bytes of pp.Path.
		sa.Name = string(unsafe.Slice((*byte)(unsafe.Pointer(&pp.Path[0])), n))
		return sa, nil

	case AF_INET:
		proto, err := socketProtocol(fd)
		if err != nil {
			return nil, err
		}

		switch proto {
		case IPPROTO_L2TP:
			pp := (*RawSockaddrL2TPIP)(unsafe.Pointer(rsa))
			sa := new(SockaddrL2TPIP)
			sa.ConnId = pp.Conn_id
			sa.Addr = pp.Addr
			return sa, nil
		default:
			pp := (*RawSockaddrInet4)(unsafe.Pointer(rsa))
			sa := new(SockaddrInet4)
			// Decode the port byte by byte, most significant byte first,
			// independent of the host byte order.
			p := (*[2]byte)(unsafe.Pointer(&pp.Port))
			sa.Port = int(p[0])<<8 + int(p[1])
			sa.Addr = pp.Addr
			return sa, nil
		}

	case AF_INET6:
		proto, err := socketProtocol(fd)
		if err != nil {
			return nil, err
		}

		switch proto {
		case IPPROTO_L2TP:
			pp := (*RawSockaddrL2TPIP6)(unsafe.Pointer(rsa))
			sa := new(SockaddrL2TPIP6)
			sa.ConnId = pp.Conn_id
			sa.ZoneId = pp.Scope_id
			sa.Addr = pp.Addr
			return sa, nil
		default:
			pp := (*RawSockaddrInet6)(unsafe.Pointer(rsa))
			sa := new(SockaddrInet6)
			// Decode the port byte by byte, most significant byte first,
			// independent of the host byte order.
			p := (*[2]byte)(unsafe.Pointer(&pp.Port))
			sa.Port = int(p[0])<<8 + int(p[1])
			sa.ZoneId = pp.Scope_id
			sa.Addr = pp.Addr
			return sa, nil
		}

	case AF_VSOCK:
		pp := (*RawSockaddrVM)(unsafe.Pointer(rsa))
		sa := &SockaddrVM{
			CID:   pp.Cid,
			Port:  pp.Port,
			Flags: pp.Flags,
		}
		return sa, nil
	case AF_BLUETOOTH:
		proto, err := socketProtocol(fd)
		if err != nil {
			return nil, err
		}
		// only BTPROTO_L2CAP and BTPROTO_RFCOMM can accept connections
		// Any other protocol leaves this switch without returning and
		// reaches the EAFNOSUPPORT return at the end of the function.
		switch proto {
		case BTPROTO_L2CAP:
			pp := (*RawSockaddrL2)(unsafe.Pointer(rsa))
			sa := &SockaddrL2{
				PSM:      pp.Psm,
				CID:      pp.Cid,
				Addr:     pp.Bdaddr,
				AddrType: pp.Bdaddr_type,
			}
			return sa, nil
		case BTPROTO_RFCOMM:
			pp := (*RawSockaddrRFCOMM)(unsafe.Pointer(rsa))
			sa := &SockaddrRFCOMM{
				Channel: pp.Channel,
				Addr:    pp.Bdaddr,
			}
			return sa, nil
		}
	case AF_XDP:
		pp := (*RawSockaddrXDP)(unsafe.Pointer(rsa))
		sa := &SockaddrXDP{
			Flags:        pp.Flags,
			Ifindex:      pp.Ifindex,
			QueueID:      pp.Queue_id,
			SharedUmemFD: pp.Shared_umem_fd,
		}
		return sa, nil
	case AF_PPPOX:
		// pp is indexed as raw bytes: [2:6] is compared against
		// px_proto_oe, [6:8] is the session ID, [8:14] the remote address
		// and the device name starts at offset 14.
		pp := (*RawSockaddrPPPoX)(unsafe.Pointer(rsa))
		if binary.BigEndian.Uint32(pp[2:6]) != px_proto_oe {
			return nil, EINVAL
		}
		sa := &SockaddrPPPoE{
			SID:    binary.BigEndian.Uint16(pp[6:8]),
			Remote: pp[8:14],
		}
		// The device name ends at the first NUL byte; sa.Dev stays empty
		// when no NUL is found within IFNAMSIZ bytes.
		for i := 14; i < 14+IFNAMSIZ; i++ {
			if pp[i] == 0 {
				sa.Dev = string(pp[14:i])
				break
			}
		}
		return sa, nil
	case AF_TIPC:
		pp := (*RawSockaddrTIPC)(unsafe.Pointer(rsa))

		sa := &SockaddrTIPC{
			Scope: int(pp.Scope),
		}

		// Determine which union variant is present in pp.Addr by checking
		// pp.Addrtype.
		// Note that sa.Addr points into rsa's storage; nothing is copied.
		switch pp.Addrtype {
		case TIPC_SERVICE_RANGE:
			sa.Addr = (*TIPCServiceRange)(unsafe.Pointer(&pp.Addr))
		case TIPC_SERVICE_ADDR:
			sa.Addr = (*TIPCServiceName)(unsafe.Pointer(&pp.Addr))
		case TIPC_SOCKET_ADDR:
			sa.Addr = (*TIPCSocketAddr)(unsafe.Pointer(&pp.Addr))
		default:
			return nil, EINVAL
		}

		return sa, nil
	case AF_IUCV:
		pp := (*RawSockaddrIUCV)(unsafe.Pointer(rsa))

		var user [8]byte
		var name [8]byte

		// The raw fields hold int8 values; convert them to bytes one by one.
		for i := range 8 {
			user[i] = byte(pp.User_id[i])
			name[i] = byte(pp.Name[i])
		}

		// All 8 bytes of each field are kept; nothing is trimmed.
		sa := &SockaddrIUCV{
			UserID: string(user[:]),
			Name:   string(name[:]),
		}
		return sa, nil

	case AF_CAN:
		proto, err := socketProtocol(fd)
		if err != nil {
			return nil, err
		}

		pp := (*RawSockaddrCAN)(unsafe.Pointer(rsa))

		switch proto {
		case CAN_J1939:
			sa := &SockaddrCANJ1939{
				Ifindex: int(pp.Ifindex),
			}
			// Copy the raw address bytes straight into the fields' memory:
			// bytes 0-7 into Name, 8-11 into PGN and byte 12 into Addr.
			name := (*[8]byte)(unsafe.Pointer(&sa.Name))
			for i := range 8 {
				name[i] = pp.Addr[i]
			}
			pgn := (*[4]byte)(unsafe.Pointer(&sa.PGN))
			for i := range 4 {
				pgn[i] = pp.Addr[i+8]
			}
			addr := (*[1]byte)(unsafe.Pointer(&sa.Addr))
			addr[0] = pp.Addr[12]
			return sa, nil
		default:
			sa := &SockaddrCAN{
				Ifindex: int(pp.Ifindex),
			}
			// Bytes 0-3 of the raw address go into RxID, bytes 4-7 into TxID.
			rx := (*[4]byte)(unsafe.Pointer(&sa.RxID))
			for i := range 4 {
				rx[i] = pp.Addr[i]
			}
			tx := (*[4]byte)(unsafe.Pointer(&sa.TxID))
			for i := range 4 {
				tx[i] = pp.Addr[i+4]
			}
			return sa, nil
		}
	case AF_NFC:
		proto, err := socketProtocol(fd)
		if err != nil {
			return nil, err
		}
		switch proto {
		case NFC_SOCKPROTO_RAW:
			pp := (*RawSockaddrNFC)(unsafe.Pointer(rsa))
			sa := &SockaddrNFC{
				DeviceIdx:   pp.Dev_idx,
				TargetIdx:   pp.Target_idx,
				NFCProtocol: pp.Nfc_protocol,
			}
			return sa, nil
		case NFC_SOCKPROTO_LLCP:
			pp := (*RawSockaddrNFCLLCP)(unsafe.Pointer(rsa))
			// Reject a length that would slice past the end of Service_name.
			if uint64(pp.Service_name_len) > uint64(len(pp.Service_name)) {
				return nil, EINVAL
			}
			sa := &SockaddrNFCLLCP{
				DeviceIdx:      pp.Dev_idx,
				TargetIdx:      pp.Target_idx,
				NFCProtocol:    pp.Nfc_protocol,
				DestinationSAP: pp.Dsap,
				SourceSAP:      pp.Ssap,
				ServiceName:    string(pp.Service_name[:pp.Service_name_len]),
			}
			return sa, nil
		default:
			return nil, EINVAL
		}
	}
	return nil, EAFNOSUPPORT
}
1233
1234func Accept(fd int) (nfd int, sa Sockaddr, err error) {
1235	var rsa RawSockaddrAny
1236	var len _Socklen = SizeofSockaddrAny
1237	nfd, err = accept4(fd, &rsa, &len, 0)
1238	if err != nil {
1239		return
1240	}
1241	sa, err = anyToSockaddr(fd, &rsa)
1242	if err != nil {
1243		Close(nfd)
1244		nfd = 0
1245	}
1246	return
1247}
1248
1249func Accept4(fd int, flags int) (nfd int, sa Sockaddr, err error) {
1250	var rsa RawSockaddrAny
1251	var len _Socklen = SizeofSockaddrAny
1252	nfd, err = accept4(fd, &rsa, &len, flags)
1253	if err != nil {
1254		return
1255	}
1256	if len > SizeofSockaddrAny {
1257		panic("RawSockaddrAny too small")
1258	}
1259	sa, err = anyToSockaddr(fd, &rsa)
1260	if err != nil {
1261		Close(nfd)
1262		nfd = 0
1263	}
1264	return
1265}
1266
1267func Getsockname(fd int) (sa Sockaddr, err error) {
1268	var rsa RawSockaddrAny
1269	var len _Socklen = SizeofSockaddrAny
1270	if err = getsockname(fd, &rsa, &len); err != nil {
1271		return
1272	}
1273	return anyToSockaddr(fd, &rsa)
1274}
1275
1276func GetsockoptIPMreqn(fd, level, opt int) (*IPMreqn, error) {
1277	var value IPMreqn
1278	vallen := _Socklen(SizeofIPMreqn)
1279	err := getsockopt(fd, level, opt, unsafe.Pointer(&value), &vallen)
1280	return &value, err
1281}
1282
1283func GetsockoptUcred(fd, level, opt int) (*Ucred, error) {
1284	var value Ucred
1285	vallen := _Socklen(SizeofUcred)
1286	err := getsockopt(fd, level, opt, unsafe.Pointer(&value), &vallen)
1287	return &value, err
1288}
1289
1290func GetsockoptTCPInfo(fd, level, opt int) (*TCPInfo, error) {
1291	var value TCPInfo
1292	vallen := _Socklen(SizeofTCPInfo)
1293	err := getsockopt(fd, level, opt, unsafe.Pointer(&value), &vallen)
1294	return &value, err
1295}
1296
1297// GetsockoptTCPCCVegasInfo returns algorithm specific congestion control information for a socket using the "vegas"
1298// algorithm.
1299//
1300// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
1301//
1302//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
1303func GetsockoptTCPCCVegasInfo(fd, level, opt int) (*TCPVegasInfo, error) {
1304	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
1305	vallen := _Socklen(SizeofTCPCCInfo)
1306	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
1307	out := (*TCPVegasInfo)(unsafe.Pointer(&value[0]))
1308	return out, err
1309}
1310
1311// GetsockoptTCPCCDCTCPInfo returns algorithm specific congestion control information for a socket using the "dctp"
1312// algorithm.
1313//
1314// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
1315//
1316//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
1317func GetsockoptTCPCCDCTCPInfo(fd, level, opt int) (*TCPDCTCPInfo, error) {
1318	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
1319	vallen := _Socklen(SizeofTCPCCInfo)
1320	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
1321	out := (*TCPDCTCPInfo)(unsafe.Pointer(&value[0]))
1322	return out, err
1323}
1324
1325// GetsockoptTCPCCBBRInfo returns algorithm specific congestion control information for a socket using the "bbr"
1326// algorithm.
1327//
1328// The socket's congestion control algorighm can be retrieved via [GetsockoptString] with the [TCP_CONGESTION] option:
1329//
1330//	algo, err := unix.GetsockoptString(fd, unix.IPPROTO_TCP, unix.TCP_CONGESTION)
1331func GetsockoptTCPCCBBRInfo(fd, level, opt int) (*TCPBBRInfo, error) {
1332	var value [SizeofTCPCCInfo / 4]uint32 // ensure proper alignment
1333	vallen := _Socklen(SizeofTCPCCInfo)
1334	err := getsockopt(fd, level, opt, unsafe.Pointer(&value[0]), &vallen)
1335	out := (*TCPBBRInfo)(unsafe.Pointer(&value[0]))
1336	return out, err
1337}
1338
1339// GetsockoptString returns the string value of the socket option opt for the
1340// socket associated with fd at the given socket level.
1341func GetsockoptString(fd, level, opt int) (string, error) {
1342	buf := make([]byte, 256)
1343	vallen := _Socklen(len(buf))
1344	err := getsockopt(fd, level, opt, unsafe.Pointer(&buf[0]), &vallen)
1345	if err != nil {
1346		if err == ERANGE {
1347			buf = make([]byte, vallen)
1348			err = getsockopt(fd, level, opt, unsafe.Pointer(&buf[0]), &vallen)
1349		}
1350		if err != nil {
1351			return "", err
1352		}
1353	}
1354	return ByteSliceToString(buf[:vallen]), nil
1355}
1356
1357func GetsockoptTpacketStats(fd, level, opt int) (*TpacketStats, error) {
1358	var value TpacketStats
1359	vallen := _Socklen(SizeofTpacketStats)
1360	err := getsockopt(fd, level, opt, unsafe.Pointer(&value), &vallen)
1361	return &value, err
1362}
1363
1364func GetsockoptTpacketStatsV3(fd, level, opt int) (*TpacketStatsV3, error) {
1365	var value TpacketStatsV3
1366	vallen := _Socklen(SizeofTpacketStatsV3)
1367	err := getsockopt(fd, level, opt, unsafe.Pointer(&value), &vallen)
1368	return &value, err
1369}
1370
1371func SetsockoptIPMreqn(fd, level, opt int, mreq *IPMreqn) (err error) {
1372	return setsockopt(fd, level, opt, unsafe.Pointer(mreq), unsafe.Sizeof(*mreq))
1373}
1374
1375func SetsockoptPacketMreq(fd, level, opt int, mreq *PacketMreq) error {
1376	return setsockopt(fd, level, opt, unsafe.Pointer(mreq), unsafe.Sizeof(*mreq))
1377}
1378
1379// SetsockoptSockFprog attaches a classic BPF or an extended BPF program to a
1380// socket to filter incoming packets.  See 'man 7 socket' for usage information.
1381func SetsockoptSockFprog(fd, level, opt int, fprog *SockFprog) error {
1382	return setsockopt(fd, level, opt, unsafe.Pointer(fprog), unsafe.Sizeof(*fprog))
1383}
1384
1385func SetsockoptCanRawFilter(fd, level, opt int, filter []CanFilter) error {
1386	var p unsafe.Pointer
1387	if len(filter) > 0 {
1388		p = unsafe.Pointer(&filter[0])
1389	}
1390	return setsockopt(fd, level, opt, p, uintptr(len(filter)*SizeofCanFilter))
1391}
1392
1393func SetsockoptTpacketReq(fd, level, opt int, tp *TpacketReq) error {
1394	return setsockopt(fd, level, opt, unsafe.Pointer(tp), unsafe.Sizeof(*tp))
1395}
1396
1397func SetsockoptTpacketReq3(fd, level, opt int, tp *TpacketReq3) error {
1398	return setsockopt(fd, level, opt, unsafe.Pointer(tp), unsafe.Sizeof(*tp))
1399}
1400
1401func SetsockoptTCPRepairOpt(fd, level, opt int, o []TCPRepairOpt) (err error) {
1402	if len(o) == 0 {
1403		return EINVAL
1404	}
1405	return setsockopt(fd, level, opt, unsafe.Pointer(&o[0]), uintptr(SizeofTCPRepairOpt*len(o)))
1406}
1407
1408func SetsockoptTCPMD5Sig(fd, level, opt int, s *TCPMD5Sig) error {
1409	return setsockopt(fd, level, opt, unsafe.Pointer(s), unsafe.Sizeof(*s))
1410}
1411
1412// Keyctl Commands (http://man7.org/linux/man-pages/man2/keyctl.2.html)
1413
1414// KeyctlInt calls keyctl commands in which each argument is an int.
1415// These commands are KEYCTL_REVOKE, KEYCTL_CHOWN, KEYCTL_CLEAR, KEYCTL_LINK,
1416// KEYCTL_UNLINK, KEYCTL_NEGATE, KEYCTL_SET_REQKEY_KEYRING, KEYCTL_SET_TIMEOUT,
1417// KEYCTL_ASSUME_AUTHORITY, KEYCTL_SESSION_TO_PARENT, KEYCTL_REJECT,
1418// KEYCTL_INVALIDATE, and KEYCTL_GET_PERSISTENT.
1419//sys	KeyctlInt(cmd int, arg2 int, arg3 int, arg4 int, arg5 int) (ret int, err error) = SYS_KEYCTL
1420
1421// KeyctlBuffer calls keyctl commands in which the third and fourth
1422// arguments are a buffer and its length, respectively.
1423// These commands are KEYCTL_UPDATE, KEYCTL_READ, and KEYCTL_INSTANTIATE.
1424//sys	KeyctlBuffer(cmd int, arg2 int, buf []byte, arg5 int) (ret int, err error) = SYS_KEYCTL
1425
1426// KeyctlString calls keyctl commands which return a string.
1427// These commands are KEYCTL_DESCRIBE and KEYCTL_GET_SECURITY.
1428func KeyctlString(cmd int, id int) (string, error) {
1429	// We must loop as the string data may change in between the syscalls.
1430	// We could allocate a large buffer here to reduce the chance that the
1431	// syscall needs to be called twice; however, this is unnecessary as
1432	// the performance loss is negligible.
1433	var buffer []byte
1434	for {
1435		// Try to fill the buffer with data
1436		length, err := KeyctlBuffer(cmd, id, buffer, 0)
1437		if err != nil {
1438			return "", err
1439		}
1440
1441		// Check if the data was written
1442		if length <= len(buffer) {
1443			// Exclude the null terminator
1444			return string(buffer[:length-1]), nil
1445		}
1446
1447		// Make a bigger buffer if needed
1448		buffer = make([]byte, length)
1449	}
1450}
1451
1452// Keyctl commands with special signatures.
1453
1454// KeyctlGetKeyringID implements the KEYCTL_GET_KEYRING_ID command.
1455// See the full documentation at:
1456// http://man7.org/linux/man-pages/man3/keyctl_get_keyring_ID.3.html
1457func KeyctlGetKeyringID(id int, create bool) (ringid int, err error) {
1458	createInt := 0
1459	if create {
1460		createInt = 1
1461	}
1462	return KeyctlInt(KEYCTL_GET_KEYRING_ID, id, createInt, 0, 0)
1463}
1464
1465// KeyctlSetperm implements the KEYCTL_SETPERM command. The perm value is the
1466// key handle permission mask as described in the "keyctl setperm" section of
1467// http://man7.org/linux/man-pages/man1/keyctl.1.html.
1468// See the full documentation at:
1469// http://man7.org/linux/man-pages/man3/keyctl_setperm.3.html
1470func KeyctlSetperm(id int, perm uint32) error {
1471	_, err := KeyctlInt(KEYCTL_SETPERM, id, int(perm), 0, 0)
1472	return err
1473}
1474
1475//sys	keyctlJoin(cmd int, arg2 string) (ret int, err error) = SYS_KEYCTL
1476
1477// KeyctlJoinSessionKeyring implements the KEYCTL_JOIN_SESSION_KEYRING command.
1478// See the full documentation at:
1479// http://man7.org/linux/man-pages/man3/keyctl_join_session_keyring.3.html
1480func KeyctlJoinSessionKeyring(name string) (ringid int, err error) {
1481	return keyctlJoin(KEYCTL_JOIN_SESSION_KEYRING, name)
1482}
1483
1484//sys	keyctlSearch(cmd int, arg2 int, arg3 string, arg4 string, arg5 int) (ret int, err error) = SYS_KEYCTL
1485
1486// KeyctlSearch implements the KEYCTL_SEARCH command.
1487// See the full documentation at:
1488// http://man7.org/linux/man-pages/man3/keyctl_search.3.html
1489func KeyctlSearch(ringid int, keyType, description string, destRingid int) (id int, err error) {
1490	return keyctlSearch(KEYCTL_SEARCH, ringid, keyType, description, destRingid)
1491}
1492
1493//sys	keyctlIOV(cmd int, arg2 int, payload []Iovec, arg5 int) (err error) = SYS_KEYCTL
1494
1495// KeyctlInstantiateIOV implements the KEYCTL_INSTANTIATE_IOV command. This
1496// command is similar to KEYCTL_INSTANTIATE, except that the payload is a slice
1497// of Iovec (each of which represents a buffer) instead of a single buffer.
1498// See the full documentation at:
1499// http://man7.org/linux/man-pages/man3/keyctl_instantiate_iov.3.html
1500func KeyctlInstantiateIOV(id int, payload []Iovec, ringid int) error {
1501	return keyctlIOV(KEYCTL_INSTANTIATE_IOV, id, payload, ringid)
1502}
1503
1504//sys	keyctlDH(cmd int, arg2 *KeyctlDHParams, buf []byte) (ret int, err error) = SYS_KEYCTL
1505
1506// KeyctlDHCompute implements the KEYCTL_DH_COMPUTE command. This command
1507// computes a Diffie-Hellman shared secret based on the provide params. The
1508// secret is written to the provided buffer and the returned size is the number
1509// of bytes written (returning an error if there is insufficient space in the
1510// buffer). If a nil buffer is passed in, this function returns the minimum
1511// buffer length needed to store the appropriate data. Note that this differs
1512// from KEYCTL_READ's behavior which always returns the requested payload size.
1513// See the full documentation at:
1514// http://man7.org/linux/man-pages/man3/keyctl_dh_compute.3.html
1515func KeyctlDHCompute(params *KeyctlDHParams, buffer []byte) (size int, err error) {
1516	return keyctlDH(KEYCTL_DH_COMPUTE, params, buffer)
1517}
1518
1519// KeyctlRestrictKeyring implements the KEYCTL_RESTRICT_KEYRING command. This
1520// command limits the set of keys that can be linked to the keyring, regardless
1521// of keyring permissions. The command requires the "setattr" permission.
1522//
1523// When called with an empty keyType the command locks the keyring, preventing
1524// any further keys from being linked to the keyring.
1525//
1526// The "asymmetric" keyType defines restrictions requiring key payloads to be
1527// DER encoded X.509 certificates signed by keys in another keyring. Restrictions
1528// for "asymmetric" include "builtin_trusted", "builtin_and_secondary_trusted",
1529// "key_or_keyring:<key>", and "key_or_keyring:<key>:chain".
1530//
1531// As of Linux 4.12, only the "asymmetric" keyType defines type-specific
1532// restrictions.
1533//
1534// See the full documentation at:
1535// http://man7.org/linux/man-pages/man3/keyctl_restrict_keyring.3.html
1536// http://man7.org/linux/man-pages/man2/keyctl.2.html
1537func KeyctlRestrictKeyring(ringid int, keyType string, restriction string) error {
1538	if keyType == "" {
1539		return keyctlRestrictKeyring(KEYCTL_RESTRICT_KEYRING, ringid)
1540	}
1541	return keyctlRestrictKeyringByType(KEYCTL_RESTRICT_KEYRING, ringid, keyType, restriction)
1542}
1543
1544//sys	keyctlRestrictKeyringByType(cmd int, arg2 int, keyType string, restriction string) (err error) = SYS_KEYCTL
1545//sys	keyctlRestrictKeyring(cmd int, arg2 int) (err error) = SYS_KEYCTL
1546
1547func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) {
1548	var msg Msghdr
1549	msg.Name = (*byte)(unsafe.Pointer(rsa))
1550	msg.Namelen = uint32(SizeofSockaddrAny)
1551	var dummy byte
1552	if len(oob) > 0 {
1553		if emptyIovecs(iov) {
1554			var sockType int
1555			sockType, err = GetsockoptInt(fd, SOL_SOCKET, SO_TYPE)
1556			if err != nil {
1557				return
1558			}
1559			// receive at least one normal byte
1560			if sockType != SOCK_DGRAM {
1561				var iova [1]Iovec
1562				iova[0].Base = &dummy
1563				iova[0].SetLen(1)
1564				iov = iova[:]
1565			}
1566		}
1567		msg.Control = &oob[0]
1568		msg.SetControllen(len(oob))
1569	}
1570	if len(iov) > 0 {
1571		msg.Iov = &iov[0]
1572		msg.SetIovlen(len(iov))
1573	}
1574	if n, err = recvmsg(fd, &msg, flags); err != nil {
1575		return
1576	}
1577	oobn = int(msg.Controllen)
1578	recvflags = int(msg.Flags)
1579	return
1580}
1581
1582func sendmsgN(fd int, iov []Iovec, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) {
1583	var msg Msghdr
1584	msg.Name = (*byte)(ptr)
1585	msg.Namelen = uint32(salen)
1586	var dummy byte
1587	var empty bool
1588	if len(oob) > 0 {
1589		empty = emptyIovecs(iov)
1590		if empty {
1591			var sockType int
1592			sockType, err = GetsockoptInt(fd, SOL_SOCKET, SO_TYPE)
1593			if err != nil {
1594				return 0, err
1595			}
1596			// send at least one normal byte
1597			if sockType != SOCK_DGRAM {
1598				var iova [1]Iovec
1599				iova[0].Base = &dummy
1600				iova[0].SetLen(1)
1601				iov = iova[:]
1602			}
1603		}
1604		msg.Control = &oob[0]
1605		msg.SetControllen(len(oob))
1606	}
1607	if len(iov) > 0 {
1608		msg.Iov = &iov[0]
1609		msg.SetIovlen(len(iov))
1610	}
1611	if n, err = sendmsg(fd, &msg, flags); err != nil {
1612		return 0, err
1613	}
1614	if len(oob) > 0 && empty {
1615		n = 0
1616	}
1617	return n, nil
1618}
1619
1620// BindToDevice binds the socket associated with fd to device.
1621func BindToDevice(fd int, device string) (err error) {
1622	return SetsockoptString(fd, SOL_SOCKET, SO_BINDTODEVICE, device)
1623}
1624
1625//sys	ptrace(request int, pid int, addr uintptr, data uintptr) (err error)
1626//sys	ptracePtr(request int, pid int, addr uintptr, data unsafe.Pointer) (err error) = SYS_PTRACE
1627
// ptracePeek fills out with bytes read from pid starting at addr, issuing the
// peek request req once per machine word. It returns the number of bytes
// copied into out; on failure it returns the error from ptracePtr together
// with the count copied so far (0 if the very first, unaligned read fails).
func ptracePeek(req int, pid int, addr uintptr, out []byte) (count int, err error) {
	// The peek requests are machine-size oriented, so we wrap it
	// to retrieve arbitrary-length data.

	// The ptrace syscall differs from glibc's ptrace.
	// Peeks returns the word in *data, not as the return value.

	var buf [SizeofPtr]byte

	// Leading edge. PEEKTEXT/PEEKDATA don't require aligned
	// access (PEEKUSER warns that it might), but if we don't
	// align our reads, we might straddle an unmapped page
	// boundary and not get the bytes leading up to the page
	// boundary.
	n := 0
	if addr%SizeofPtr != 0 {
		// Read the aligned word containing addr and keep only the bytes
		// from addr onwards.
		err = ptracePtr(req, pid, addr-addr%SizeofPtr, unsafe.Pointer(&buf[0]))
		if err != nil {
			return 0, err
		}
		n += copy(out, buf[addr%SizeofPtr:])
		out = out[n:]
	}

	// Remainder.
	for len(out) > 0 {
		// We use an internal buffer to guarantee alignment.
		// It's not documented if this is necessary, but we're paranoid.
		err = ptracePtr(req, pid, addr+uintptr(n), unsafe.Pointer(&buf[0]))
		if err != nil {
			return n, err
		}
		// copy stops at len(out), so a short tail takes only part of the word.
		copied := copy(out, buf[0:])
		n += copied
		out = out[copied:]
	}

	return n, nil
}
1667
1668func PtracePeekText(pid int, addr uintptr, out []byte) (count int, err error) {
1669	return ptracePeek(PTRACE_PEEKTEXT, pid, addr, out)
1670}
1671
1672func PtracePeekData(pid int, addr uintptr, out []byte) (count int, err error) {
1673	return ptracePeek(PTRACE_PEEKDATA, pid, addr, out)
1674}
1675
1676func PtracePeekUser(pid int, addr uintptr, out []byte) (count int, err error) {
1677	return ptracePeek(PTRACE_PEEKUSR, pid, addr, out)
1678}
1679
// ptracePoke writes data into pid starting at addr, one machine word per
// pokeReq request. Where data covers only part of a word (an unaligned start
// or a short tail), the existing word is first read with peekReq so that the
// bytes outside data are written back unchanged. It returns the number of
// bytes written; on failure it returns the error from the ptrace call
// together with the count written so far (0 if the leading edge fails).
func ptracePoke(pokeReq int, peekReq int, pid int, addr uintptr, data []byte) (count int, err error) {
	// As for ptracePeek, we need to align our accesses to deal
	// with the possibility of straddling an invalid page.

	// Leading edge.
	n := 0
	if addr%SizeofPtr != 0 {
		var buf [SizeofPtr]byte
		// Fetch the aligned word containing addr, overlay the new bytes
		// from addr onwards, then write the whole word back.
		err = ptracePtr(peekReq, pid, addr-addr%SizeofPtr, unsafe.Pointer(&buf[0]))
		if err != nil {
			return 0, err
		}
		n += copy(buf[addr%SizeofPtr:], data)
		word := *((*uintptr)(unsafe.Pointer(&buf[0])))
		err = ptrace(pokeReq, pid, addr-addr%SizeofPtr, word)
		if err != nil {
			return 0, err
		}
		data = data[n:]
	}

	// Interior.
	// The condition is a strict '>', so a final chunk of exactly SizeofPtr
	// bytes is left for the trailing-edge path below.
	for len(data) > SizeofPtr {
		word := *((*uintptr)(unsafe.Pointer(&data[0])))
		err = ptrace(pokeReq, pid, addr+uintptr(n), word)
		if err != nil {
			return n, err
		}
		n += SizeofPtr
		data = data[SizeofPtr:]
	}

	// Trailing edge.
	if len(data) > 0 {
		var buf [SizeofPtr]byte
		// Fetch the existing word so the bytes past the end of data are
		// preserved when the word is written back.
		err = ptracePtr(peekReq, pid, addr+uintptr(n), unsafe.Pointer(&buf[0]))
		if err != nil {
			return n, err
		}
		copy(buf[0:], data)
		word := *((*uintptr)(unsafe.Pointer(&buf[0])))
		err = ptrace(pokeReq, pid, addr+uintptr(n), word)
		if err != nil {
			return n, err
		}
		n += len(data)
	}

	return n, nil
}
1730
1731func PtracePokeText(pid int, addr uintptr, data []byte) (count int, err error) {
1732	return ptracePoke(PTRACE_POKETEXT, PTRACE_PEEKTEXT, pid, addr, data)
1733}
1734
1735func PtracePokeData(pid int, addr uintptr, data []byte) (count int, err error) {
1736	return ptracePoke(PTRACE_POKEDATA, PTRACE_PEEKDATA, pid, addr, data)
1737}
1738
1739func PtracePokeUser(pid int, addr uintptr, data []byte) (count int, err error) {
1740	return ptracePoke(PTRACE_POKEUSR, PTRACE_PEEKUSR, pid, addr, data)
1741}
1742
// elfNT_PRSTATUS is a copy of the debug/elf.NT_PRSTATUS constant so
// x/sys/unix doesn't need to depend on debug/elf and thus
// compress/zlib, debug/dwarf, and other packages.
//
// PtraceGetRegs and PtraceSetRegs pass it as the addr argument of their
// PTRACE_GETREGSET and PTRACE_SETREGSET requests.
const elfNT_PRSTATUS = 1
1747
1748func PtraceGetRegs(pid int, regsout *PtraceRegs) (err error) {
1749	var iov Iovec
1750	iov.Base = (*byte)(unsafe.Pointer(regsout))
1751	iov.SetLen(int(unsafe.Sizeof(*regsout)))
1752	return ptracePtr(PTRACE_GETREGSET, pid, uintptr(elfNT_PRSTATUS), unsafe.Pointer(&iov))
1753}
1754
1755func PtraceSetRegs(pid int, regs *PtraceRegs) (err error) {
1756	var iov Iovec
1757	iov.Base = (*byte)(unsafe.Pointer(regs))
1758	iov.SetLen(int(unsafe.Sizeof(*regs)))
1759	return ptracePtr(PTRACE_SETREGSET, pid, uintptr(elfNT_PRSTATUS), unsafe.Pointer(&iov))
1760}
1761
1762func PtraceSetOptions(pid int, options int) (err error) {
1763	return ptrace(PTRACE_SETOPTIONS, pid, 0, uintptr(options))
1764}
1765
1766func PtraceGetEventMsg(pid int) (msg uint, err error) {
1767	var data _C_long
1768	err = ptracePtr(PTRACE_GETEVENTMSG, pid, 0, unsafe.Pointer(&data))
1769	msg = uint(data)
1770	return
1771}
1772
1773func PtraceCont(pid int, signal int) (err error) {
1774	return ptrace(PTRACE_CONT, pid, 0, uintptr(signal))
1775}
1776
1777func PtraceSyscall(pid int, signal int) (err error) {
1778	return ptrace(PTRACE_SYSCALL, pid, 0, uintptr(signal))
1779}
1780
1781func PtraceSingleStep(pid int) (err error) { return ptrace(PTRACE_SINGLESTEP, pid, 0, 0) }
1782
1783func PtraceInterrupt(pid int) (err error) { return ptrace(PTRACE_INTERRUPT, pid, 0, 0) }
1784
1785func PtraceAttach(pid int) (err error) { return ptrace(PTRACE_ATTACH, pid, 0, 0) }
1786
1787func PtraceSeize(pid int) (err error) { return ptrace(PTRACE_SEIZE, pid, 0, 0) }
1788
1789func PtraceDetach(pid int) (err error) { return ptrace(PTRACE_DETACH, pid, 0, 0) }
1790
1791//sys	reboot(magic1 uint, magic2 uint, cmd int, arg string) (err error)
1792
1793func Reboot(cmd int) (err error) {
1794	return reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, "")
1795}
1796
1797func direntIno(buf []byte) (uint64, bool) {
1798	return readInt(buf, unsafe.Offsetof(Dirent{}.Ino), unsafe.Sizeof(Dirent{}.Ino))
1799}
1800
1801func direntReclen(buf []byte) (uint64, bool) {
1802	return readInt(buf, unsafe.Offsetof(Dirent{}.Reclen), unsafe.Sizeof(Dirent{}.Reclen))
1803}
1804
1805func direntNamlen(buf []byte) (uint64, bool) {
1806	reclen, ok := direntReclen(buf)
1807	if !ok {
1808		return 0, false
1809	}
1810	return reclen - uint64(unsafe.Offsetof(Dirent{}.Name)), true
1811}
1812
1813//sys	mount(source string, target string, fstype string, flags uintptr, data *byte) (err error)
1814
1815func Mount(source string, target string, fstype string, flags uintptr, data string) (err error) {
1816	// Certain file systems get rather angry and EINVAL if you give
1817	// them an empty string of data, rather than NULL.
1818	if data == "" {
1819		return mount(source, target, fstype, flags, nil)
1820	}
1821	datap, err := BytePtrFromString(data)
1822	if err != nil {
1823		return err
1824	}
1825	return mount(source, target, fstype, flags, datap)
1826}
1827
1828//sys	mountSetattr(dirfd int, pathname string, flags uint, attr *MountAttr, size uintptr) (err error) = SYS_MOUNT_SETATTR
1829
1830// MountSetattr is a wrapper for mount_setattr(2).
1831// https://man7.org/linux/man-pages/man2/mount_setattr.2.html
1832//
1833// Requires kernel >= 5.12.
1834func MountSetattr(dirfd int, pathname string, flags uint, attr *MountAttr) error {
1835	return mountSetattr(dirfd, pathname, flags, attr, unsafe.Sizeof(*attr))
1836}
1837
1838func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) {
1839	if raceenabled {
1840		raceReleaseMerge(unsafe.Pointer(&ioSync))
1841	}
1842	return sendfile(outfd, infd, offset, count)
1843}
1844
1845// Sendto
1846// Recvfrom
1847// Socketpair
1848
1849/*
1850 * Direct access
1851 */
1852//sys	Acct(path string) (err error)
1853//sys	AddKey(keyType string, description string, payload []byte, ringid int) (id int, err error)
1854//sys	Adjtimex(buf *Timex) (state int, err error)
1855//sysnb	Capget(hdr *CapUserHeader, data *CapUserData) (err error)
1856//sysnb	Capset(hdr *CapUserHeader, data *CapUserData) (err error)
1857//sys	Chdir(path string) (err error)
1858//sys	Chroot(path string) (err error)
1859//sys	ClockAdjtime(clockid int32, buf *Timex) (state int, err error)
1860//sys	ClockGetres(clockid int32, res *Timespec) (err error)
1861//sys	ClockGettime(clockid int32, time *Timespec) (err error)
1862//sys	ClockSettime(clockid int32, time *Timespec) (err error)
1863//sys	ClockNanosleep(clockid int32, flags int, request *Timespec, remain *Timespec) (err error)
1864//sys	Close(fd int) (err error)
1865//sys	CloseRange(first uint, last uint, flags uint) (err error)
1866//sys	CopyFileRange(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int, err error)
1867//sys	DeleteModule(name string, flags int) (err error)
1868//sys	Dup(oldfd int) (fd int, err error)
1869
1870func Dup2(oldfd, newfd int) error {
1871	return Dup3(oldfd, newfd, 0)
1872}
1873
1874//sys	Dup3(oldfd int, newfd int, flags int) (err error)
1875//sysnb	EpollCreate1(flag int) (fd int, err error)
1876//sysnb	EpollCtl(epfd int, op int, fd int, event *EpollEvent) (err error)
1877//sys	Eventfd(initval uint, flags int) (fd int, err error) = SYS_EVENTFD2
1878//sys	Exit(code int) = SYS_EXIT_GROUP
1879//sys	Fallocate(fd int, mode uint32, off int64, len int64) (err error)
1880//sys	Fchdir(fd int) (err error)
1881//sys	Fchmod(fd int, mode uint32) (err error)
1882//sys	Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error)
1883//sys	Fdatasync(fd int) (err error)
1884//sys	Fgetxattr(fd int, attr string, dest []byte) (sz int, err error)
1885//sys	FinitModule(fd int, params string, flags int) (err error)
1886//sys	Flistxattr(fd int, dest []byte) (sz int, err error)
1887//sys	Flock(fd int, how int) (err error)
1888//sys	Fremovexattr(fd int, attr string) (err error)
1889//sys	Fsetxattr(fd int, attr string, dest []byte, flags int) (err error)
1890//sys	Fsync(fd int) (err error)
1891//sys	Fsmount(fd int, flags int, mountAttrs int) (fsfd int, err error)
1892//sys	Fsopen(fsName string, flags int) (fd int, err error)
1893//sys	Fspick(dirfd int, pathName string, flags int) (fd int, err error)
1894
1895//sys	fsconfig(fd int, cmd uint, key *byte, value *byte, aux int) (err error)
1896
1897func fsconfigCommon(fd int, cmd uint, key string, value *byte, aux int) (err error) {
1898	var keyp *byte
1899	if keyp, err = BytePtrFromString(key); err != nil {
1900		return
1901	}
1902	return fsconfig(fd, cmd, keyp, value, aux)
1903}
1904
1905// FsconfigSetFlag is equivalent to fsconfig(2) called
1906// with cmd == FSCONFIG_SET_FLAG.
1907//
1908// fd is the filesystem context to act upon.
1909// key the parameter key to set.
1910func FsconfigSetFlag(fd int, key string) (err error) {
1911	return fsconfigCommon(fd, FSCONFIG_SET_FLAG, key, nil, 0)
1912}
1913
1914// FsconfigSetString is equivalent to fsconfig(2) called
1915// with cmd == FSCONFIG_SET_STRING.
1916//
1917// fd is the filesystem context to act upon.
1918// key the parameter key to set.
1919// value is the parameter value to set.
1920func FsconfigSetString(fd int, key string, value string) (err error) {
1921	var valuep *byte
1922	if valuep, err = BytePtrFromString(value); err != nil {
1923		return
1924	}
1925	return fsconfigCommon(fd, FSCONFIG_SET_STRING, key, valuep, 0)
1926}
1927
1928// FsconfigSetBinary is equivalent to fsconfig(2) called
1929// with cmd == FSCONFIG_SET_BINARY.
1930//
1931// fd is the filesystem context to act upon.
1932// key the parameter key to set.
1933// value is the parameter value to set.
1934func FsconfigSetBinary(fd int, key string, value []byte) (err error) {
1935	if len(value) == 0 {
1936		return EINVAL
1937	}
1938	return fsconfigCommon(fd, FSCONFIG_SET_BINARY, key, &value[0], len(value))
1939}
1940
1941// FsconfigSetPath is equivalent to fsconfig(2) called
1942// with cmd == FSCONFIG_SET_PATH.
1943//
1944// fd is the filesystem context to act upon.
1945// key the parameter key to set.
1946// path is a non-empty path for specified key.
1947// atfd is a file descriptor at which to start lookup from or AT_FDCWD.
1948func FsconfigSetPath(fd int, key string, path string, atfd int) (err error) {
1949	var valuep *byte
1950	if valuep, err = BytePtrFromString(path); err != nil {
1951		return
1952	}
1953	return fsconfigCommon(fd, FSCONFIG_SET_PATH, key, valuep, atfd)
1954}
1955
1956// FsconfigSetPathEmpty is equivalent to fsconfig(2) called
1957// with cmd == FSCONFIG_SET_PATH_EMPTY. The same as
1958// FconfigSetPath but with AT_PATH_EMPTY implied.
1959func FsconfigSetPathEmpty(fd int, key string, path string, atfd int) (err error) {
1960	var valuep *byte
1961	if valuep, err = BytePtrFromString(path); err != nil {
1962		return
1963	}
1964	return fsconfigCommon(fd, FSCONFIG_SET_PATH_EMPTY, key, valuep, atfd)
1965}
1966
1967// FsconfigSetFd is equivalent to fsconfig(2) called
1968// with cmd == FSCONFIG_SET_FD.
1969//
1970// fd is the filesystem context to act upon.
1971// key the parameter key to set.
1972// value is a file descriptor to be assigned to specified key.
1973func FsconfigSetFd(fd int, key string, value int) (err error) {
1974	return fsconfigCommon(fd, FSCONFIG_SET_FD, key, nil, value)
1975}
1976
1977// FsconfigCreate is equivalent to fsconfig(2) called
1978// with cmd == FSCONFIG_CMD_CREATE.
1979//
1980// fd is the filesystem context to act upon.
1981func FsconfigCreate(fd int) (err error) {
1982	return fsconfig(fd, FSCONFIG_CMD_CREATE, nil, nil, 0)
1983}
1984
1985// FsconfigReconfigure is equivalent to fsconfig(2) called
1986// with cmd == FSCONFIG_CMD_RECONFIGURE.
1987//
1988// fd is the filesystem context to act upon.
1989func FsconfigReconfigure(fd int) (err error) {
1990	return fsconfig(fd, FSCONFIG_CMD_RECONFIGURE, nil, nil, 0)
1991}
1992
1993//sys	Getdents(fd int, buf []byte) (n int, err error) = SYS_GETDENTS64
1994//sysnb	Getpgid(pid int) (pgid int, err error)
1995
1996func Getpgrp() (pid int) {
1997	pid, _ = Getpgid(0)
1998	return
1999}
2000
2001//sysnb	Getpid() (pid int)
2002//sysnb	Getppid() (ppid int)
2003//sys	Getpriority(which int, who int) (prio int, err error)
2004
2005func Getrandom(buf []byte, flags int) (n int, err error) {
2006	vdsoRet, supported := vgetrandom(buf, uint32(flags))
2007	if supported {
2008		if vdsoRet < 0 {
2009			return 0, errnoErr(syscall.Errno(-vdsoRet))
2010		}
2011		return vdsoRet, nil
2012	}
2013	var p *byte
2014	if len(buf) > 0 {
2015		p = &buf[0]
2016	}
2017	r, _, e := Syscall(SYS_GETRANDOM, uintptr(unsafe.Pointer(p)), uintptr(len(buf)), uintptr(flags))
2018	if e != 0 {
2019		return 0, errnoErr(e)
2020	}
2021	return int(r), nil
2022}
2023
2024//sysnb	Getrusage(who int, rusage *Rusage) (err error)
2025//sysnb	Getsid(pid int) (sid int, err error)
2026//sysnb	Gettid() (tid int)
2027//sys	Getxattr(path string, attr string, dest []byte) (sz int, err error)
2028//sys	InitModule(moduleImage []byte, params string) (err error)
2029//sys	InotifyAddWatch(fd int, pathname string, mask uint32) (watchdesc int, err error)
2030//sysnb	InotifyInit1(flags int) (fd int, err error)
2031//sysnb	InotifyRmWatch(fd int, watchdesc uint32) (success int, err error)
2032//sysnb	Kill(pid int, sig syscall.Signal) (err error)
2033//sys	Klogctl(typ int, buf []byte) (n int, err error) = SYS_SYSLOG
2034//sys	Lgetxattr(path string, attr string, dest []byte) (sz int, err error)
2035//sys	Listxattr(path string, dest []byte) (sz int, err error)
2036//sys	Llistxattr(path string, dest []byte) (sz int, err error)
2037//sys	Lremovexattr(path string, attr string) (err error)
2038//sys	Lsetxattr(path string, attr string, data []byte, flags int) (err error)
2039//sys	MemfdCreate(name string, flags int) (fd int, err error)
2040//sys	Mkdirat(dirfd int, path string, mode uint32) (err error)
2041//sys	Mknodat(dirfd int, path string, mode uint32, dev int) (err error)
2042//sys	MoveMount(fromDirfd int, fromPathName string, toDirfd int, toPathName string, flags int) (err error)
2043//sys	Nanosleep(time *Timespec, leftover *Timespec) (err error)
2044//sys	OpenTree(dfd int, fileName string, flags uint) (r int, err error)
2045//sys	PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error)
2046//sys	PivotRoot(newroot string, putold string) (err error) = SYS_PIVOT_ROOT
2047//sys	Prctl(option int, arg2 uintptr, arg3 uintptr, arg4 uintptr, arg5 uintptr) (err error)
2048//sys	pselect6(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timespec, sigmask *sigset_argpack) (n int, err error)
2049//sys	read(fd int, p []byte) (n int, err error)
2050//sys	Removexattr(path string, attr string) (err error)
2051//sys	Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) (err error)
2052//sys	RequestKey(keyType string, description string, callback string, destRingid int) (id int, err error)
2053//sys	Setdomainname(p []byte) (err error)
2054//sys	Sethostname(p []byte) (err error)
2055//sysnb	Setpgid(pid int, pgid int) (err error)
2056//sysnb	Setsid() (pid int, err error)
2057//sysnb	Settimeofday(tv *Timeval) (err error)
2058//sys	Setns(fd int, nstype int) (err error)
2059
// syscall_prlimit has no body in this file; the go:linkname directive
// below binds it to the unexported prlimit function of package syscall.
// It is called by Prlimit.
//
//go:linkname syscall_prlimit syscall.prlimit
func syscall_prlimit(pid, resource int, newlimit, old *syscall.Rlimit) error
2062
2063func Prlimit(pid, resource int, newlimit, old *Rlimit) error {
2064	// Just call the syscall version, because as of Go 1.21
2065	// it will affect starting a new process.
2066	return syscall_prlimit(pid, resource, (*syscall.Rlimit)(newlimit), (*syscall.Rlimit)(old))
2067}
2068
2069// PrctlRetInt performs a prctl operation specified by option and further
2070// optional arguments arg2 through arg5 depending on option. It returns a
2071// non-negative integer that is returned by the prctl syscall.
2072func PrctlRetInt(option int, arg2 uintptr, arg3 uintptr, arg4 uintptr, arg5 uintptr) (int, error) {
2073	ret, _, err := Syscall6(SYS_PRCTL, uintptr(option), uintptr(arg2), uintptr(arg3), uintptr(arg4), uintptr(arg5), 0)
2074	if err != 0 {
2075		return 0, err
2076	}
2077	return int(ret), nil
2078}
2079
// Setuid delegates to syscall.Setuid and returns its error.
func Setuid(uid int) (err error) {
	err = syscall.Setuid(uid)
	return err
}
2083
// Setgid delegates to syscall.Setgid and returns its error.
func Setgid(gid int) (err error) {
	err = syscall.Setgid(gid)
	return err
}
2087
// Setreuid delegates to syscall.Setreuid and returns its error.
func Setreuid(ruid, euid int) (err error) {
	err = syscall.Setreuid(ruid, euid)
	return err
}
2091
// Setregid delegates to syscall.Setregid and returns its error.
func Setregid(rgid, egid int) (err error) {
	err = syscall.Setregid(rgid, egid)
	return err
}
2095
// Setresuid delegates to syscall.Setresuid and returns its error.
func Setresuid(ruid, euid, suid int) (err error) {
	err = syscall.Setresuid(ruid, euid, suid)
	return err
}
2099
// Setresgid delegates to syscall.Setresgid and returns its error.
func Setresgid(rgid, egid, sgid int) (err error) {
	err = syscall.Setresgid(rgid, egid, sgid)
	return err
}
2103
2104// SetfsgidRetGid sets fsgid for current thread and returns previous fsgid set.
2105// setfsgid(2) will return a non-nil error only if its caller lacks CAP_SETUID capability.
2106// If the call fails due to other reasons, current fsgid will be returned.
2107func SetfsgidRetGid(gid int) (int, error) {
2108	return setfsgid(gid)
2109}
2110
2111// SetfsuidRetUid sets fsuid for current thread and returns previous fsuid set.
2112// setfsgid(2) will return a non-nil error only if its caller lacks CAP_SETUID capability
2113// If the call fails due to other reasons, current fsuid will be returned.
2114func SetfsuidRetUid(uid int) (int, error) {
2115	return setfsuid(uid)
2116}
2117
2118func Setfsgid(gid int) error {
2119	_, err := setfsgid(gid)
2120	return err
2121}
2122
2123func Setfsuid(uid int) error {
2124	_, err := setfsuid(uid)
2125	return err
2126}
2127
2128func Signalfd(fd int, sigmask *Sigset_t, flags int) (newfd int, err error) {
2129	return signalfd(fd, sigmask, _C__NSIG/8, flags)
2130}
2131
2132//sys	Setpriority(which int, who int, prio int) (err error)
2133//sys	Setxattr(path string, attr string, data []byte, flags int) (err error)
2134//sys	signalfd(fd int, sigmask *Sigset_t, maskSize uintptr, flags int) (newfd int, err error) = SYS_SIGNALFD4
2135//sys	Statx(dirfd int, path string, flags int, mask int, stat *Statx_t) (err error)
2136//sys	Sync()
2137//sys	Syncfs(fd int) (err error)
2138//sysnb	Sysinfo(info *Sysinfo_t) (err error)
2139//sys	Tee(rfd int, wfd int, len int, flags int) (n int64, err error)
2140//sysnb	TimerfdCreate(clockid int, flags int) (fd int, err error)
2141//sysnb	TimerfdGettime(fd int, currValue *ItimerSpec) (err error)
2142//sysnb	TimerfdSettime(fd int, flags int, newValue *ItimerSpec, oldValue *ItimerSpec) (err error)
2143//sysnb	Tgkill(tgid int, tid int, sig syscall.Signal) (err error)
2144//sysnb	Times(tms *Tms) (ticks uintptr, err error)
2145//sysnb	Umask(mask int) (oldmask int)
2146//sysnb	Uname(buf *Utsname) (err error)
2147//sys	Unmount(target string, flags int) (err error) = SYS_UMOUNT2
2148//sys	Unshare(flags int) (err error)
2149//sys	write(fd int, p []byte) (n int, err error)
2150//sys	exitThread(code int) (err error) = SYS_EXIT
2151//sys	readv(fd int, iovs []Iovec) (n int, err error) = SYS_READV
2152//sys	writev(fd int, iovs []Iovec) (n int, err error) = SYS_WRITEV
2153//sys	preadv(fd int, iovs []Iovec, offs_l uintptr, offs_h uintptr) (n int, err error) = SYS_PREADV
2154//sys	pwritev(fd int, iovs []Iovec, offs_l uintptr, offs_h uintptr) (n int, err error) = SYS_PWRITEV
2155//sys	preadv2(fd int, iovs []Iovec, offs_l uintptr, offs_h uintptr, flags int) (n int, err error) = SYS_PREADV2
2156//sys	pwritev2(fd int, iovs []Iovec, offs_l uintptr, offs_h uintptr, flags int) (n int, err error) = SYS_PWRITEV2
2157
// minIovec is the capacity of the small initial Iovec slice allocated
// by Readv, Writev, etc.
//
// This small allocation gets stack allocated, which lets the
// common use case of len(iovs) <= minIovec avoid more expensive
// heap allocations.
const minIovec = 8
2165
2166// appendBytes converts bs to Iovecs and appends them to vecs.
2167func appendBytes(vecs []Iovec, bs [][]byte) []Iovec {
2168	for _, b := range bs {
2169		var v Iovec
2170		v.SetLen(len(b))
2171		if len(b) > 0 {
2172			v.Base = &b[0]
2173		} else {
2174			v.Base = (*byte)(unsafe.Pointer(&_zero))
2175		}
2176		vecs = append(vecs, v)
2177	}
2178	return vecs
2179}
2180
// offs2lohi splits offs into its low and high order bits.
//
// lo is offs converted to uintptr. hi is offs, viewed as a uint64,
// shifted right by longBits bits in total: when longBits is 64 every bit
// is shifted out and hi is 0; when longBits is 32 hi holds the upper
// 32 bits of offs.
func offs2lohi(offs int64) (lo, hi uintptr) {
	// longBits is the width of a C long in bits.
	const longBits = SizeofLong * 8
	return uintptr(offs), uintptr(uint64(offs) >> (longBits - 1) >> 1) // two shifts to avoid false positive in vet
}
2186
2187func Readv(fd int, iovs [][]byte) (n int, err error) {
2188	iovecs := make([]Iovec, 0, minIovec)
2189	iovecs = appendBytes(iovecs, iovs)
2190	n, err = readv(fd, iovecs)
2191	readvRacedetect(iovecs, n, err)
2192	return n, err
2193}
2194
2195func Preadv(fd int, iovs [][]byte, offset int64) (n int, err error) {
2196	iovecs := make([]Iovec, 0, minIovec)
2197	iovecs = appendBytes(iovecs, iovs)
2198	lo, hi := offs2lohi(offset)
2199	n, err = preadv(fd, iovecs, lo, hi)
2200	readvRacedetect(iovecs, n, err)
2201	return n, err
2202}
2203
2204func Preadv2(fd int, iovs [][]byte, offset int64, flags int) (n int, err error) {
2205	iovecs := make([]Iovec, 0, minIovec)
2206	iovecs = appendBytes(iovecs, iovs)
2207	lo, hi := offs2lohi(offset)
2208	n, err = preadv2(fd, iovecs, lo, hi, flags)
2209	readvRacedetect(iovecs, n, err)
2210	return n, err
2211}
2212
2213func readvRacedetect(iovecs []Iovec, n int, err error) {
2214	if !raceenabled {
2215		return
2216	}
2217	for i := 0; n > 0 && i < len(iovecs); i++ {
2218		m := min(int(iovecs[i].Len), n)
2219		n -= m
2220		if m > 0 {
2221			raceWriteRange(unsafe.Pointer(iovecs[i].Base), m)
2222		}
2223	}
2224	if err == nil {
2225		raceAcquire(unsafe.Pointer(&ioSync))
2226	}
2227}
2228
2229func Writev(fd int, iovs [][]byte) (n int, err error) {
2230	iovecs := make([]Iovec, 0, minIovec)
2231	iovecs = appendBytes(iovecs, iovs)
2232	if raceenabled {
2233		raceReleaseMerge(unsafe.Pointer(&ioSync))
2234	}
2235	n, err = writev(fd, iovecs)
2236	writevRacedetect(iovecs, n)
2237	return n, err
2238}
2239
2240func Pwritev(fd int, iovs [][]byte, offset int64) (n int, err error) {
2241	iovecs := make([]Iovec, 0, minIovec)
2242	iovecs = appendBytes(iovecs, iovs)
2243	if raceenabled {
2244		raceReleaseMerge(unsafe.Pointer(&ioSync))
2245	}
2246	lo, hi := offs2lohi(offset)
2247	n, err = pwritev(fd, iovecs, lo, hi)
2248	writevRacedetect(iovecs, n)
2249	return n, err
2250}
2251
2252func Pwritev2(fd int, iovs [][]byte, offset int64, flags int) (n int, err error) {
2253	iovecs := make([]Iovec, 0, minIovec)
2254	iovecs = appendBytes(iovecs, iovs)
2255	if raceenabled {
2256		raceReleaseMerge(unsafe.Pointer(&ioSync))
2257	}
2258	lo, hi := offs2lohi(offset)
2259	n, err = pwritev2(fd, iovecs, lo, hi, flags)
2260	writevRacedetect(iovecs, n)
2261	return n, err
2262}
2263
2264func writevRacedetect(iovecs []Iovec, n int) {
2265	if !raceenabled {
2266		return
2267	}
2268	for i := 0; n > 0 && i < len(iovecs); i++ {
2269		m := min(int(iovecs[i].Len), n)
2270		n -= m
2271		if m > 0 {
2272			raceReadRange(unsafe.Pointer(iovecs[i].Base), m)
2273		}
2274	}
2275}
2276
2277// mmap varies by architecture; see syscall_linux_*.go.
2278//sys	munmap(addr uintptr, length uintptr) (err error)
2279//sys	mremap(oldaddr uintptr, oldlength uintptr, newlength uintptr, flags int, newaddr uintptr) (xaddr uintptr, err error)
2280//sys	Madvise(b []byte, advice int) (err error)
2281//sys	Mprotect(b []byte, prot int) (err error)
2282//sys	Mlock(b []byte) (err error)
2283//sys	Mlockall(flags int) (err error)
2284//sys	Msync(b []byte, flags int) (err error)
2285//sys	Munlock(b []byte) (err error)
2286//sys	Munlockall() (err error)
2287
// Unexported aliases for the MREMAP_* flag constants.
const (
	mremapFixed     = MREMAP_FIXED
	mremapDontunmap = MREMAP_DONTUNMAP
	mremapMaymove   = MREMAP_MAYMOVE
)
2293
2294// Vmsplice splices user pages from a slice of Iovecs into a pipe specified by fd,
2295// using the specified flags.
2296func Vmsplice(fd int, iovs []Iovec, flags int) (int, error) {
2297	var p unsafe.Pointer
2298	if len(iovs) > 0 {
2299		p = unsafe.Pointer(&iovs[0])
2300	}
2301
2302	n, _, errno := Syscall6(SYS_VMSPLICE, uintptr(fd), uintptr(p), uintptr(len(iovs)), uintptr(flags), 0, 0)
2303	if errno != 0 {
2304		return 0, syscall.Errno(errno)
2305	}
2306
2307	return int(n), nil
2308}
2309
2310func isGroupMember(gid int) bool {
2311	groups, err := Getgroups()
2312	if err != nil {
2313		return false
2314	}
2315
2316	return slices.Contains(groups, gid)
2317}
2318
2319func isCapDacOverrideSet() bool {
2320	hdr := CapUserHeader{Version: LINUX_CAPABILITY_VERSION_3}
2321	data := [2]CapUserData{}
2322	err := Capget(&hdr, &data[0])
2323
2324	return err == nil && data[0].Effective&(1<<CAP_DAC_OVERRIDE) != 0
2325}
2326
2327//sys	faccessat(dirfd int, path string, mode uint32) (err error)
2328//sys	Faccessat2(dirfd int, path string, mode uint32, flags int) (err error)
2329
2330func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) {
2331	if flags == 0 {
2332		return faccessat(dirfd, path, mode)
2333	}
2334
2335	if err := Faccessat2(dirfd, path, mode, flags); err != ENOSYS && err != EPERM {
2336		return err
2337	}
2338
2339	// The Linux kernel faccessat system call does not take any flags.
2340	// The glibc faccessat implements the flags itself; see
2341	// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/faccessat.c;hb=HEAD
2342	// Because people naturally expect syscall.Faccessat to act
2343	// like C faccessat, we do the same.
2344
2345	if flags & ^(AT_SYMLINK_NOFOLLOW|AT_EACCESS) != 0 {
2346		return EINVAL
2347	}
2348
2349	var st Stat_t
2350	if err := Fstatat(dirfd, path, &st, flags&AT_SYMLINK_NOFOLLOW); err != nil {
2351		return err
2352	}
2353
2354	mode &= 7
2355	if mode == 0 {
2356		return nil
2357	}
2358
2359	var uid int
2360	if flags&AT_EACCESS != 0 {
2361		uid = Geteuid()
2362		if uid != 0 && isCapDacOverrideSet() {
2363			// If CAP_DAC_OVERRIDE is set, file access check is
2364			// done by the kernel in the same way as for root
2365			// (see generic_permission() in the Linux sources).
2366			uid = 0
2367		}
2368	} else {
2369		uid = Getuid()
2370	}
2371
2372	if uid == 0 {
2373		if mode&1 == 0 {
2374			// Root can read and write any file.
2375			return nil
2376		}
2377		if st.Mode&0111 != 0 {
2378			// Root can execute any file that anybody can execute.
2379			return nil
2380		}
2381		return EACCES
2382	}
2383
2384	var fmode uint32
2385	if uint32(uid) == st.Uid {
2386		fmode = (st.Mode >> 6) & 7
2387	} else {
2388		var gid int
2389		if flags&AT_EACCESS != 0 {
2390			gid = Getegid()
2391		} else {
2392			gid = Getgid()
2393		}
2394
2395		if uint32(gid) == st.Gid || isGroupMember(int(st.Gid)) {
2396			fmode = (st.Mode >> 3) & 7
2397		} else {
2398			fmode = st.Mode & 7
2399		}
2400	}
2401
2402	if fmode&mode == mode {
2403		return nil
2404	}
2405
2406	return EACCES
2407}
2408
2409//sys	nameToHandleAt(dirFD int, pathname string, fh *fileHandle, mountID *_C_int, flags int) (err error) = SYS_NAME_TO_HANDLE_AT
2410//sys	openByHandleAt(mountFD int, fh *fileHandle, flags int) (fd int, err error) = SYS_OPEN_BY_HANDLE_AT
2411
// fileHandle is the argument to nameToHandleAt and openByHandleAt. We
// originally tried to generate it via unix/linux/types.go with "type
// fileHandle C.struct_file_handle" but that generated empty structs
// for mips64 and mips64le. Instead, hard code it for now (it's the
// same everywhere else) until the mips64 generator issue is fixed.
//
// It describes only the fixed-size header; the handle payload follows it
// directly in memory.
type fileHandle struct {
	Bytes uint32
	Type  int32
}

// FileHandle represents the C struct file_handle used by
// name_to_handle_at (see NameToHandleAt) and open_by_handle_at (see
// OpenByHandleAt).
type FileHandle struct {
	*fileHandle
}

// NewFileHandle constructs a FileHandle of the given type whose payload
// is a copy of handle.
func NewFileHandle(handleType int32, handle []byte) FileHandle {
	const hdrSize = unsafe.Sizeof(fileHandle{})

	// One allocation holds the header followed by the payload.
	buf := make([]byte, hdrSize+uintptr(len(handle)))
	hdr := (*fileHandle)(unsafe.Pointer(&buf[0]))
	hdr.Bytes = uint32(len(handle))
	hdr.Type = handleType
	copy(buf[hdrSize:], handle)
	return FileHandle{fileHandle: hdr}
}

// Size returns the payload length recorded in the header.
func (fh *FileHandle) Size() int {
	return int(fh.fileHandle.Bytes)
}

// Type returns the handle type recorded in the header.
func (fh *FileHandle) Type() int32 {
	return fh.fileHandle.Type
}

// Bytes returns the payload stored immediately after the header, or nil
// when the recorded size is zero. The slice aliases the handle's memory.
func (fh *FileHandle) Bytes() []byte {
	size := fh.Size()
	if size == 0 {
		return nil
	}
	payload := unsafe.Add(unsafe.Pointer(fh.fileHandle), unsafe.Sizeof(*fh.fileHandle))
	return unsafe.Slice((*byte)(payload), size)
}
2449
2450// NameToHandleAt wraps the name_to_handle_at system call; it obtains
2451// a handle for a path name.
2452func NameToHandleAt(dirfd int, path string, flags int) (handle FileHandle, mountID int, err error) {
2453	var mid _C_int
2454	// Try first with a small buffer, assuming the handle will
2455	// only be 32 bytes.
2456	size := uint32(32 + unsafe.Sizeof(fileHandle{}))
2457	didResize := false
2458	for {
2459		buf := make([]byte, size)
2460		fh := (*fileHandle)(unsafe.Pointer(&buf[0]))
2461		fh.Bytes = size - uint32(unsafe.Sizeof(fileHandle{}))
2462		err = nameToHandleAt(dirfd, path, fh, &mid, flags)
2463		if err == EOVERFLOW {
2464			if didResize {
2465				// We shouldn't need to resize more than once
2466				return
2467			}
2468			didResize = true
2469			size = fh.Bytes + uint32(unsafe.Sizeof(fileHandle{}))
2470			continue
2471		}
2472		if err != nil {
2473			return
2474		}
2475		return FileHandle{fh}, int(mid), nil
2476	}
2477}
2478
2479// OpenByHandleAt wraps the open_by_handle_at system call; it opens a
2480// file via a handle as previously returned by NameToHandleAt.
2481func OpenByHandleAt(mountFD int, handle FileHandle, flags int) (fd int, err error) {
2482	return openByHandleAt(mountFD, handle.fileHandle, flags)
2483}
2484
2485// Klogset wraps the sys_syslog system call; it sets console_loglevel to
2486// the value specified by arg and passes a dummy pointer to bufp.
2487func Klogset(typ int, arg int) (err error) {
2488	var p unsafe.Pointer
2489	_, _, errno := Syscall(SYS_SYSLOG, uintptr(typ), uintptr(p), uintptr(arg))
2490	if errno != 0 {
2491		return errnoErr(errno)
2492	}
2493	return nil
2494}
2495
// RemoteIovec is Iovec with the pointer replaced with an integer.
// It is used for ProcessVMReadv and ProcessVMWritev, where the pointer
// refers to a location in a different process' address space, which
// would confuse the Go garbage collector.
type RemoteIovec struct {
	// Base is the address in the other process, held as an integer.
	Base uintptr
	// Len is the length of the region starting at Base
	// (presumably in bytes, as for Iovec — confirm).
	Len int
}
2504
2505//sys	ProcessVMReadv(pid int, localIov []Iovec, remoteIov []RemoteIovec, flags uint) (n int, err error) = SYS_PROCESS_VM_READV
2506//sys	ProcessVMWritev(pid int, localIov []Iovec, remoteIov []RemoteIovec, flags uint) (n int, err error) = SYS_PROCESS_VM_WRITEV
2507
2508//sys	PidfdOpen(pid int, flags int) (fd int, err error) = SYS_PIDFD_OPEN
2509//sys	PidfdGetfd(pidfd int, targetfd int, flags int) (fd int, err error) = SYS_PIDFD_GETFD
2510//sys	PidfdSendSignal(pidfd int, sig Signal, info *Siginfo, flags int) (err error) = SYS_PIDFD_SEND_SIGNAL
2511
2512//sys	shmat(id int, addr uintptr, flag int) (ret uintptr, err error)
2513//sys	shmctl(id int, cmd int, buf *SysvShmDesc) (result int, err error)
2514//sys	shmdt(addr uintptr) (err error)
2515//sys	shmget(key int, size int, flag int) (id int, err error)
2516
2517//sys	getitimer(which int, currValue *Itimerval) (err error)
2518//sys	setitimer(which int, newValue *Itimerval, oldValue *Itimerval) (err error)
2519
2520// MakeItimerval creates an Itimerval from interval and value durations.
2521func MakeItimerval(interval, value time.Duration) Itimerval {
2522	return Itimerval{
2523		Interval: NsecToTimeval(interval.Nanoseconds()),
2524		Value:    NsecToTimeval(value.Nanoseconds()),
2525	}
2526}
2527
// ItimerWhich is a value which may be passed to the which parameter for
// Getitimer and Setitimer.
type ItimerWhich int

// Possible which values for Getitimer and Setitimer.
const (
	ItimerReal    ItimerWhich = ITIMER_REAL
	ItimerVirtual ItimerWhich = ITIMER_VIRTUAL
	ItimerProf    ItimerWhich = ITIMER_PROF
)
2538
2539// Getitimer wraps getitimer(2) to return the current value of the timer
2540// specified by which.
2541func Getitimer(which ItimerWhich) (Itimerval, error) {
2542	var it Itimerval
2543	if err := getitimer(int(which), &it); err != nil {
2544		return Itimerval{}, err
2545	}
2546
2547	return it, nil
2548}
2549
2550// Setitimer wraps setitimer(2) to arm or disarm the timer specified by which.
2551// It returns the previous value of the timer.
2552//
2553// If the Itimerval argument is the zero value, the timer will be disarmed.
2554func Setitimer(which ItimerWhich, it Itimerval) (Itimerval, error) {
2555	var prev Itimerval
2556	if err := setitimer(int(which), &it, &prev); err != nil {
2557		return Itimerval{}, err
2558	}
2559
2560	return prev, nil
2561}
2562
2563//sysnb	rtSigprocmask(how int, set *Sigset_t, oldset *Sigset_t, sigsetsize uintptr) (err error) = SYS_RT_SIGPROCMASK
2564
2565func PthreadSigmask(how int, set, oldset *Sigset_t) error {
2566	if oldset != nil {
2567		// Explicitly clear in case Sigset_t is larger than _C__NSIG.
2568		*oldset = Sigset_t{}
2569	}
2570	return rtSigprocmask(how, set, oldset, _C__NSIG/8)
2571}
2572
2573//sysnb	getresuid(ruid *_C_int, euid *_C_int, suid *_C_int)
2574//sysnb	getresgid(rgid *_C_int, egid *_C_int, sgid *_C_int)
2575
2576func Getresuid() (ruid, euid, suid int) {
2577	var r, e, s _C_int
2578	getresuid(&r, &e, &s)
2579	return int(r), int(e), int(s)
2580}
2581
2582func Getresgid() (rgid, egid, sgid int) {
2583	var r, e, s _C_int
2584	getresgid(&r, &e, &s)
2585	return int(r), int(e), int(s)
2586}
2587
2588// Pselect is a wrapper around the Linux pselect6 system call.
2589// This version does not modify the timeout argument.
2590func Pselect(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timespec, sigmask *Sigset_t) (n int, err error) {
2591	// Per https://man7.org/linux/man-pages/man2/select.2.html#NOTES,
2592	// The Linux pselect6() system call modifies its timeout argument.
2593	// [Not modifying the argument] is the behavior required by POSIX.1-2001.
2594	var mutableTimeout *Timespec
2595	if timeout != nil {
2596		mutableTimeout = new(Timespec)
2597		*mutableTimeout = *timeout
2598	}
2599
2600	// The final argument of the pselect6() system call is not a
2601	// sigset_t * pointer, but is instead a structure
2602	var kernelMask *sigset_argpack
2603	if sigmask != nil {
2604		wordBits := 32 << (^uintptr(0) >> 63) // see math.intSize
2605
2606		// A sigset stores one bit per signal,
2607		// offset by 1 (because signal 0 does not exist).
2608		// So the number of words needed is ⌈__C_NSIG - 1 / wordBits⌉.
2609		sigsetWords := (_C__NSIG - 1 + wordBits - 1) / (wordBits)
2610
2611		sigsetBytes := uintptr(sigsetWords * (wordBits / 8))
2612		kernelMask = &sigset_argpack{
2613			ss:    sigmask,
2614			ssLen: sigsetBytes,
2615		}
2616	}
2617
2618	return pselect6(nfd, r, w, e, mutableTimeout, kernelMask)
2619}
2620
2621//sys	schedSetattr(pid int, attr *SchedAttr, flags uint) (err error)
2622//sys	schedGetattr(pid int, attr *SchedAttr, size uint, flags uint) (err error)
2623
2624// SchedSetAttr is a wrapper for sched_setattr(2) syscall.
2625// https://man7.org/linux/man-pages/man2/sched_setattr.2.html
2626func SchedSetAttr(pid int, attr *SchedAttr, flags uint) error {
2627	if attr == nil {
2628		return EINVAL
2629	}
2630	attr.Size = SizeofSchedAttr
2631	return schedSetattr(pid, attr, flags)
2632}
2633
2634// SchedGetAttr is a wrapper for sched_getattr(2) syscall.
2635// https://man7.org/linux/man-pages/man2/sched_getattr.2.html
2636func SchedGetAttr(pid int, flags uint) (*SchedAttr, error) {
2637	attr := &SchedAttr{}
2638	if err := schedGetattr(pid, attr, SizeofSchedAttr, flags); err != nil {
2639		return nil, err
2640	}
2641	return attr, nil
2642}
2643
2644//sys	Cachestat(fd uint, crange *CachestatRange, cstat *Cachestat_t, flags uint) (err error)
2645//sys	Mseal(b []byte, flags uint) (err error)
2646
2647//sys	setMemPolicy(mode int, mask *CPUSet, size int) (err error) = SYS_SET_MEMPOLICY
2648
2649func SetMemPolicy(mode int, mask *CPUSet) error {
2650	return setMemPolicy(mode, mask, _CPU_SETSIZE)
2651}