diff --git a/README.md b/README.md index 0eb71a5d..8652ccf6 100644 --- a/README.md +++ b/README.md @@ -220,9 +220,10 @@ LDFLAGS+= $(pkg-config --libs libcpuinfo) - [x] x86-64 (iPhone simulator) - [x] ARMv7 - [x] ARM64 -- [x] OS X +- [x] macOS - [x] x86 - [x] x86-64 + - [x] ARM64 (Apple silicon) - [x] Windows - [x] x86 - [x] x86-64 diff --git a/include/cpuinfo.h b/include/cpuinfo.h index cffa299e..e904d869 100644 --- a/include/cpuinfo.h +++ b/include/cpuinfo.h @@ -1468,6 +1468,7 @@ static inline bool cpuinfo_has_x86_sha(void) { bool dot; bool jscvt; bool fcma; + bool fhm; bool aes; bool sha1; @@ -1737,6 +1738,14 @@ static inline bool cpuinfo_has_arm_fcma(void) { #endif } +static inline bool cpuinfo_has_arm_fhm(void) { + #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 + return cpuinfo_isa.fhm; + #else + return false; + #endif +} + static inline bool cpuinfo_has_arm_aes(void) { #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 return cpuinfo_isa.aes; diff --git a/src/arm/linux/aarch64-isa.c b/src/arm/linux/aarch64-isa.c index 2000e1a1..eeee6c60 100644 --- a/src/arm/linux/aarch64-isa.c +++ b/src/arm/linux/aarch64-isa.c @@ -124,4 +124,8 @@ void cpuinfo_arm64_linux_decode_isa_from_proc_cpuinfo( if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SVE2) { isa->sve2 = true; } + if (features & CPUINFO_ARM_LINUX_FEATURE_ASIMDFHM) { + isa->fhm = true; + } + } diff --git a/src/arm/linux/cpuinfo.c b/src/arm/linux/cpuinfo.c index 90e1631e..1b37fd50 100644 --- a/src/arm/linux/cpuinfo.c +++ b/src/arm/linux/cpuinfo.c @@ -283,6 +283,8 @@ static void parse_features( #if CPUINFO_ARCH_ARM64 processor->features |= CPUINFO_ARM_LINUX_FEATURE_ASIMDRDM; #endif + } else if (memcmp(feature_start, "asimdfhm", feature_length) == 0) { + processor->features |= CPUINFO_ARM_LINUX_FEATURE_ASIMDFHM; #if CPUINFO_ARCH_ARM } else if (memcmp(feature_start, "fastmult", feature_length) == 0) { processor->features |= CPUINFO_ARM_LINUX_FEATURE_FASTMULT; diff --git a/src/arm/mach/init.c b/src/arm/mach/init.c index 
dbea578c..dbe99364 100644 --- a/src/arm/mach/init.c +++ b/src/arm/mach/init.c @@ -14,6 +14,12 @@ #include #include +#define SAFE_FREE_NULL(x) \ + do { \ + if((x)) free((x)); \ + (x) = NULL; \ + } while(0); + /* Polyfill recent CPUFAMILY_ARM_* values for older SDKs */ #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xE81E7EF6 @@ -54,6 +60,54 @@ struct cpuinfo_arm_isa cpuinfo_isa = { #endif }; +struct cache_array { + struct cpuinfo_cache *caches; + uint32_t count; +}; + +/* + * iOS 15 and macOS Monterey 12 added sysctls to describe configuration information + * where not all cores are the same (number of cores, cache sizes). + * + * Each perflevel sysctl has a prefix of `hw.perflevel??.` where ?? is the + * perflevel index, starting at zero. The total number of perflevels is + * exposed via the `hw.nperflevels` sysctl. Higher performance perflevels + * have lower indexes. + * + * sysctls: + * - hw.nperflevels - number of different types of cores / cache configs (perflevels) + * - hw.perflevel?? + * - .physicalcpu - number of enabled physical cores for perflevel ?? + * - .physicalcpu_max - number of physical cores for perflevel ?? + * - .logicalcpu - number of enabled logical cores for perflevel ?? + * - .logicalcpu_max - number of logical cores for perflevel ?? + * - .l1icachesize - size in bytes of L1 instruction cache for cores in perflevel ?? + * - .l1dcachesize - size in bytes of L1 data cache for cores in perflevel ?? + * - .l2cachesize - size in bytes of L2 data cache for cores in perflevel ?? + * - .cpusperl2 - number of cores that share an L2 cache in perflevel ?? + * - .l3cachesize - size in bytes of L3 data cache for cores in perflevel ?? + * - .cpusperl3 - number of cores that share an L3 cache in perflevel ?? + * + * Technically, these perflevels could be in src/mach/api.h since they are supported + * across architectures (x86_64 and arm64). 
x86_64 doesn't currently have multiple + * perflevels, which means there's not much benefit there. + */ +struct mach_perflevel { + uint32_t physicalcpu; + uint32_t physicalcpu_max; + uint32_t logicalcpu; + uint32_t logicalcpu_max; + uint32_t l1icachesize; + uint32_t l1dcachesize; + uint32_t l2cachesize; + uint32_t cpusperl2; + uint32_t l3cachesize; + uint32_t cpusperl3; + + uint32_t core_start; /* first core index this perflevel describes */ + uint32_t processor_start; /* first processor index this perflevel describes */ +}; + static uint32_t get_sys_info(int type_specifier, const char* name) { size_t size = 0; uint32_t result = 0; @@ -128,7 +182,7 @@ static enum cpuinfo_uarch decode_uarch(uint32_t cpu_family, uint32_t cpu_subtype #endif } -static void decode_package_name(char* package_name) { +static void decode_hw_machine_package_name(char* package_name) { size_t size; if (sysctlbyname("hw.machine", NULL, &size, NULL, 0) != 0) { cpuinfo_log_warning("sysctlbyname(\"hw.machine\") failed: %s", strerror(errno)); @@ -252,55 +306,30 @@ static void decode_package_name(char* package_name) { } } -void cpuinfo_arm_mach_init(void) { - struct cpuinfo_processor* processors = NULL; - struct cpuinfo_core* cores = NULL; - struct cpuinfo_cluster* clusters = NULL; - struct cpuinfo_package* packages = NULL; - struct cpuinfo_uarch_info* uarchs = NULL; - struct cpuinfo_cache* l1i = NULL; - struct cpuinfo_cache* l1d = NULL; - struct cpuinfo_cache* l2 = NULL; - struct cpuinfo_cache* l3 = NULL; - - struct cpuinfo_mach_topology mach_topology = cpuinfo_mach_detect_topology(); - processors = calloc(mach_topology.threads, sizeof(struct cpuinfo_processor)); - if (processors == NULL) { - cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" logical processors", - mach_topology.threads * sizeof(struct cpuinfo_processor), mach_topology.threads); - goto cleanup; - } - cores = calloc(mach_topology.cores, sizeof(struct cpuinfo_core)); - if (cores == NULL) { - 
cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" cores", - mach_topology.cores * sizeof(struct cpuinfo_core), mach_topology.cores); - goto cleanup; - } - packages = calloc(mach_topology.packages, sizeof(struct cpuinfo_package)); - if (packages == NULL) { - cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" packages", - mach_topology.packages * sizeof(struct cpuinfo_package), mach_topology.packages); - goto cleanup; +static void read_package_name(char* package_name) { + decode_hw_machine_package_name(package_name); + if(package_name[0] != '\0'){ + return; } - const uint32_t threads_per_core = mach_topology.threads / mach_topology.cores; - const uint32_t threads_per_package = mach_topology.threads / mach_topology.packages; - const uint32_t cores_per_package = mach_topology.cores / mach_topology.packages; + /* Try to pull package name from machdep.cpu.brand_string */ + size_t size; + if (sysctlbyname("machdep.cpu.brand_string", NULL, &size, NULL, 0) != 0) { + cpuinfo_log_warning("sysctlbyname(\"machdep.cpu.brand_string\") failed: %s", strerror(errno)); + return; + } - for (uint32_t i = 0; i < mach_topology.packages; i++) { - packages[i] = (struct cpuinfo_package) { - .processor_start = i * threads_per_package, - .processor_count = threads_per_package, - .core_start = i * cores_per_package, - .core_count = cores_per_package, - }; - decode_package_name(packages[i].name); + char *brand_string = alloca(size); + if (sysctlbyname("machdep.cpu.brand_string", brand_string, &size, NULL, 0) != 0) { + cpuinfo_log_warning("sysctlbyname(\"machdep.cpu.brand_string\") failed: %s", strerror(errno)); + return; } + cpuinfo_log_debug("machdep.cpu.brand_string: %s", brand_string); + strlcpy(package_name, brand_string, CPUINFO_PACKAGE_NAME_MAX); +} - const uint32_t cpu_family = get_sys_info_by_name("hw.cpufamily"); - const uint32_t cpu_type = get_sys_info_by_name("hw.cputype"); - const uint32_t cpu_subtype = 
get_sys_info_by_name("hw.cpusubtype"); +static void detect_isa(uint32_t cpu_family, uint32_t cpu_type, uint32_t cpu_subtype) { switch (cpu_type) { case CPU_TYPE_ARM64: cpuinfo_isa.aes = true; @@ -336,110 +365,240 @@ void cpuinfo_arm_mach_init(void) { break; #endif } + /* - * Support for ARMv8.1 Atomics & FP16 arithmetic instructions is supposed to be detected via - * sysctlbyname calls with "hw.optional.armv8_1_atomics" and "hw.optional.neon_fp16" arguments - * (see https://devstreaming-cdn.apple.com/videos/wwdc/2018/409t8zw7rumablsh/409/409_whats_new_in_llvm.pdf), - * but on new iOS versions these calls just fail with EPERM. - * - * Thus, we whitelist CPUs known to support these instructions. + * iOS 15 and macOS Monterey 12 added sysctls for Arm features. Use them where + * possible. Otherwise, fallback to hardcoded set of CPUs with known + * support. */ - switch (cpu_family) { - case CPUFAMILY_ARM_MONSOON_MISTRAL: - case CPUFAMILY_ARM_VORTEX_TEMPEST: - case CPUFAMILY_ARM_LIGHTNING_THUNDER: - case CPUFAMILY_ARM_FIRESTORM_ICESTORM: - #if CPUINFO_ARCH_ARM64 - cpuinfo_isa.atomics = true; - #endif - cpuinfo_isa.fp16arith = true; + + const uint32_t has_FEAT_LSE = get_sys_info_by_name("hw.optional.arm.FEAT_LSE"); + if (has_FEAT_LSE != 0) { + cpuinfo_isa.atomics = true; } + #if CPUINFO_ARCH_ARM64 + else { + switch (cpu_family) { + case CPUFAMILY_ARM_MONSOON_MISTRAL: + case CPUFAMILY_ARM_VORTEX_TEMPEST: + case CPUFAMILY_ARM_LIGHTNING_THUNDER: + case CPUFAMILY_ARM_FIRESTORM_ICESTORM: + cpuinfo_isa.atomics = true; + } + } + #endif - /* - * There does not yet seem to exist an OS mechanism to detect support for - * ARMv8.2 optional dot-product instructions, so we currently whitelist CPUs - * known to support these instruction. 
- */ - switch (cpu_family) { - case CPUFAMILY_ARM_LIGHTNING_THUNDER: - case CPUFAMILY_ARM_FIRESTORM_ICESTORM: - cpuinfo_isa.dot = true; + const uint32_t has_FEAT_RDM = get_sys_info_by_name("hw.optional.arm.FEAT_RDM"); + if (has_FEAT_RDM != 0) { + cpuinfo_isa.rdm = true; } - uint32_t num_clusters = 1; - for (uint32_t i = 0; i < mach_topology.cores; i++) { - cores[i] = (struct cpuinfo_core) { - .processor_start = i * threads_per_core, - .processor_count = threads_per_core, - .core_id = i % cores_per_package, - .package = packages + i / cores_per_package, - .vendor = cpuinfo_vendor_apple, - .uarch = decode_uarch(cpu_family, cpu_subtype, i, mach_topology.cores), - }; - if (i != 0 && cores[i].uarch != cores[i - 1].uarch) { - num_clusters++; + const uint32_t has_FEAT_FP16 = get_sys_info_by_name("hw.optional.arm.FEAT_FP16"); + if (has_FEAT_FP16 != 0) { + cpuinfo_isa.fp16arith = true; + } else { + switch (cpu_family) { + case CPUFAMILY_ARM_MONSOON_MISTRAL: + case CPUFAMILY_ARM_VORTEX_TEMPEST: + case CPUFAMILY_ARM_LIGHTNING_THUNDER: + case CPUFAMILY_ARM_FIRESTORM_ICESTORM: + cpuinfo_isa.fp16arith = true; } } - for (uint32_t i = 0; i < mach_topology.threads; i++) { - const uint32_t smt_id = i % threads_per_core; - const uint32_t core_id = i / threads_per_core; - const uint32_t package_id = i / threads_per_package; - processors[i].smt_id = smt_id; - processors[i].core = &cores[core_id]; - processors[i].package = &packages[package_id]; + const uint32_t has_FEAT_DotProd = get_sys_info_by_name("hw.optional.arm.FEAT_DotProd"); + if (has_FEAT_DotProd != 0) { + cpuinfo_isa.dot = true; + } else { + switch (cpu_family) { + case CPUFAMILY_ARM_LIGHTNING_THUNDER: + case CPUFAMILY_ARM_FIRESTORM_ICESTORM: + cpuinfo_isa.dot = true; + } } - clusters = calloc(num_clusters, sizeof(struct cpuinfo_cluster)); - if (clusters == NULL) { - cpuinfo_log_error( - "failed to allocate %zu bytes for descriptions of %"PRIu32" clusters", - num_clusters * sizeof(struct cpuinfo_cluster), num_clusters); - 
goto cleanup; + const uint32_t has_FEAT_JSCVT = get_sys_info_by_name("hw.optional.arm.FEAT_JSCVT"); + if (has_FEAT_JSCVT != 0) { + cpuinfo_isa.jscvt = true; } - uarchs = calloc(num_clusters, sizeof(struct cpuinfo_uarch_info)); - if (uarchs == NULL) { - cpuinfo_log_error( - "failed to allocate %zu bytes for descriptions of %"PRIu32" uarchs", - num_clusters * sizeof(enum cpuinfo_uarch), num_clusters); - goto cleanup; + + const uint32_t has_FEAT_FCMA = get_sys_info_by_name("hw.optional.arm.FEAT_FCMA"); + if (has_FEAT_FCMA != 0) { + cpuinfo_isa.fcma = true; } - uint32_t cluster_idx = UINT32_MAX; - for (uint32_t i = 0; i < mach_topology.cores; i++) { - if (i == 0 || cores[i].uarch != cores[i - 1].uarch) { - cluster_idx++; - uarchs[cluster_idx] = (struct cpuinfo_uarch_info) { - .uarch = cores[i].uarch, - .processor_count = 1, - .core_count = 1, - }; - clusters[cluster_idx] = (struct cpuinfo_cluster) { - .processor_start = i * threads_per_core, - .processor_count = 1, - .core_start = i, - .core_count = 1, - .cluster_id = cluster_idx, - .package = cores[i].package, - .vendor = cores[i].vendor, - .uarch = cores[i].uarch, - }; + + const uint32_t has_FEAT_FHM = get_sys_info_by_name("hw.optional.arm.FEAT_FHM"); + if (has_FEAT_FHM != 0) { + cpuinfo_isa.fhm = true; + } else { + // Prior to iOS 15 or macOS Monterey 12, use 'hw.optional.armv8_2_fhm' + const uint32_t has_FEAT_FHM_legacy = get_sys_info_by_name("hw.optional.armv8_2_fhm"); + if (has_FEAT_FHM_legacy != 0) { + cpuinfo_isa.fhm = true; + } + } +} + +static char * alloc_sysctl_perflevel_string(uint32_t perflevel, const char * const perflevel_suffix) { + char * ret = NULL; + int err = asprintf(&ret, "hw.perflevel%u.%s", perflevel, perflevel_suffix); + if(err == -1 || ret == NULL){ + cpuinfo_log_error("failed to allocate memory for hw.perflevel* string"); + return NULL; + } + + return ret; +} + +static struct mach_perflevel * read_perflevels(const uint32_t nperflevels) { + struct mach_perflevel * perflevels = NULL; + + 
perflevels = calloc(nperflevels, sizeof (*perflevels)); + if(!perflevels){ + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" perflevels", + nperflevels * sizeof(*perflevels), nperflevels); + return NULL; + } + + /* + * Each perflevel sysctl is of the form "hw.perflevel??.NAME" + * where ?? is an integer starting at zero and incrementing for each level + * and NAME is the name of the sysctl. Since they change based on the + * level, we need to build them dynamically. + */ + char * sysctl_physicalcpu = NULL; + char * sysctl_physicalcpu_max = NULL; + char * sysctl_logicalcpu = NULL; + char * sysctl_logicalcpu_max = NULL; + char * sysctl_l1icachesize = NULL; + char * sysctl_l1dcachesize = NULL; + char * sysctl_l2cachesize = NULL; + char * sysctl_cpusperl2 = NULL; + char * sysctl_l3cachesize = NULL; + char * sysctl_cpusperl3 = NULL; + + uint32_t core_index = 0; + uint32_t processor_index = 0; + + bool success = true; + uint32_t i = 0; + for(; icount = mach_topology.threads / threads_per_l1; + l1d->count = l1i->count; + cpuinfo_log_debug("detected %"PRIu32" L1 caches", l1i->count); } - uint32_t threads_per_l2 = 0, l2_count = 0; + uint32_t threads_per_l2 = 0; if (l2_cache_size != 0) { /* Assume L2 cache is shared between all cores */ threads_per_l2 = mach_topology.cores; - l2_count = 1; - cpuinfo_log_debug("detected %"PRIu32" L2 caches", l2_count); + l2->count = 1; + cpuinfo_log_debug("detected %"PRIu32" L2 caches", l2->count); } - uint32_t threads_per_l3 = 0, l3_count = 0; + uint32_t threads_per_l3 = 0; if (l3_cache_size != 0) { /* Assume L3 cache is shared between all cores */ threads_per_l3 = mach_topology.cores; - l3_count = 1; - cpuinfo_log_debug("detected %"PRIu32" L3 caches", l3_count); + l3->count = 1; + cpuinfo_log_debug("detected %"PRIu32" L3 caches", l3->count); } if (l1i_cache_size != 0) { - l1i = calloc(l1_count, sizeof(struct cpuinfo_cache)); - if (l1i == NULL) { + l1i->caches = calloc(l1i->count, sizeof(*(l1i->caches))); + if (l1i->caches == 
NULL) { cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1I caches", - l1_count * sizeof(struct cpuinfo_cache), l1_count); - goto cleanup; + l1i->count * sizeof(*(l1i->caches)), l1i->count); + return false; } - for (uint32_t c = 0; c < l1_count; c++) { - l1i[c] = (struct cpuinfo_cache) { + for (uint32_t c = 0; c < l1i->count; c++) { + l1i->caches[c] = (struct cpuinfo_cache) { .size = l1i_cache_size, .associativity = l1_cache_associativity, .sets = l1i_cache_size / (l1_cache_associativity * cacheline_size), @@ -496,20 +660,17 @@ void cpuinfo_arm_mach_init(void) { .processor_count = threads_per_l1, }; } - for (uint32_t t = 0; t < mach_topology.threads; t++) { - processors[t].cache.l1i = &l1i[t / threads_per_l1]; - } } if (l1d_cache_size != 0) { - l1d = calloc(l1_count, sizeof(struct cpuinfo_cache)); - if (l1d == NULL) { + l1d->caches = calloc(l1d->count, sizeof(*(l1d->caches))); + if (l1d->caches == NULL) { cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1D caches", - l1_count * sizeof(struct cpuinfo_cache), l1_count); - goto cleanup; + l1d->count * sizeof(*(l1d->caches)), l1d->count); + return false; } - for (uint32_t c = 0; c < l1_count; c++) { - l1d[c] = (struct cpuinfo_cache) { + for (uint32_t c = 0; c < l1d->count; c++) { + l1d->caches[c] = (struct cpuinfo_cache) { .size = l1d_cache_size, .associativity = l1_cache_associativity, .sets = l1d_cache_size / (l1_cache_associativity * cacheline_size), @@ -520,20 +681,17 @@ void cpuinfo_arm_mach_init(void) { .processor_count = threads_per_l1, }; } - for (uint32_t t = 0; t < mach_topology.threads; t++) { - processors[t].cache.l1d = &l1d[t / threads_per_l1]; - } } - if (l2_count != 0) { - l2 = calloc(l2_count, sizeof(struct cpuinfo_cache)); - if (l2 == NULL) { + if (l2->count != 0) { + l2->caches = calloc(l2->count, sizeof(*(l2->caches))); + if (l2->caches == NULL) { cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L2 caches", - 
l2_count * sizeof(struct cpuinfo_cache), l2_count); - goto cleanup; + l2->count * sizeof(*(l2->caches)), l2->count); + return false; } - for (uint32_t c = 0; c < l2_count; c++) { - l2[c] = (struct cpuinfo_cache) { + for (uint32_t c = 0; c < l2->count; c++) { + l2->caches[c] = (struct cpuinfo_cache) { .size = l2_cache_size, .associativity = l2_cache_associativity, .sets = l2_cache_size / (l2_cache_associativity * cacheline_size), @@ -544,20 +702,17 @@ void cpuinfo_arm_mach_init(void) { .processor_count = threads_per_l2, }; } - for (uint32_t t = 0; t < mach_topology.threads; t++) { - processors[t].cache.l2 = &l2[0]; - } } - if (l3_count != 0) { - l3 = calloc(l3_count, sizeof(struct cpuinfo_cache)); - if (l3 == NULL) { + if (l3->count != 0) { + l3->caches = calloc(l3->count, sizeof(*(l3->caches))); + if (l3->caches == NULL) { cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L3 caches", - l3_count * sizeof(struct cpuinfo_cache), l3_count); - goto cleanup; + l3->count * sizeof(*(l3->caches)), l3->count); + return false; } - for (uint32_t c = 0; c < l3_count; c++) { - l3[c] = (struct cpuinfo_cache) { + for (uint32_t c = 0; c < l3->count; c++) { + l3->caches[c] = (struct cpuinfo_cache) { .size = l3_cache_size, .associativity = l3_cache_associativity, .sets = l3_cache_size / (l3_cache_associativity * cacheline_size), @@ -568,31 +723,468 @@ void cpuinfo_arm_mach_init(void) { .processor_count = threads_per_l3, }; } - for (uint32_t t = 0; t < mach_topology.threads; t++) { - processors[t].cache.l3 = &l3[0]; + } + + return true; +} + +bool detect_caches_using_perflevels( + const struct cpuinfo_mach_topology mach_topology, + const struct mach_perflevel * const perflevels, + const uint32_t nperflevels, + struct cache_array *l1i, + struct cache_array *l1d, + struct cache_array *l2, + struct cache_array *l3 +) +{ + if(!l1i || !l1d || !l2 || !l3){ + cpuinfo_log_error("cannot detect caches. 
no place to store results."); + return false; + } + + const uint32_t cacheline_size = get_sys_info(HW_CACHELINE, "HW_CACHELINE"); + /* + * Cache associativity, partitions, and flags values here are copied from + * previous implementation. + */ + const uint32_t l1_cache_associativity = 4; + const uint32_t l2_cache_associativity = 8; + const uint32_t l3_cache_associativity = 16; + const uint32_t cache_partitions = 1; + const uint32_t cache_flags = 0; + + l1i->count = 0; + l1d->count = 0; + l2->count = 0; + l3->count = 0; + for(uint32_t pl=0; plcount += perflevels[pl].physicalcpu_max; + } + + if(perflevels[pl].l1dcachesize != 0){ + /* One l1d cache per core */ + l1d->count += perflevels[pl].physicalcpu_max; + } + + if(perflevels[pl].cpusperl2 != 0){ + l2->count += perflevels[pl].physicalcpu_max / perflevels[pl].cpusperl2; + } + + if(perflevels[pl].cpusperl3 != 0){ + l3->count += perflevels[pl].physicalcpu_max / perflevels[pl].cpusperl3; } } + if(l1i->count != 0){ + l1i->caches = calloc(l1i->count, sizeof(*(l1i->caches))); + if(l1i->caches == NULL){ + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1I caches", + l1i->count * sizeof(*(l1i->caches)), l1i->count); + return false; + } + for(uint32_t pl=0; plcaches[c] = (struct cpuinfo_cache) { + .size = perflevels[pl].l1icachesize, + .associativity = l1_cache_associativity, + .sets = perflevels[pl].l1icachesize / (l1_cache_associativity * cacheline_size), + .partitions = cache_partitions, + .line_size = cacheline_size, + .flags = cache_flags, + .processor_start = c * threads_per_l1, + .processor_count = threads_per_l1, + }; + } + } + } + } + + if(l1d->count != 0){ + l1d->caches = calloc(l1d->count, sizeof(*(l1d->caches))); + if(l1d->caches == NULL){ + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L1D caches", + l1d->count * sizeof(*(l1d->caches)), l1d->count); + return false; + } + for(uint32_t pl=0; plcaches[c] = (struct cpuinfo_cache) { + .size = 
perflevels[pl].l1dcachesize, + .associativity = l1_cache_associativity, + .sets = perflevels[pl].l1dcachesize / (l1_cache_associativity * cacheline_size), + .partitions = cache_partitions, + .line_size = cacheline_size, + .flags = cache_flags, + .processor_start = c * threads_per_l1, + .processor_count = threads_per_l1, + }; + } + } + } + } + + if(l2->count != 0){ + l2->caches = calloc(l2->count, sizeof(*(l2->caches))); + if(l2->caches == NULL){ + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L2 caches", + l2->count * sizeof(*(l2->caches)), l2->count); + return false; + } + uint32_t cache_index = 0; + for(uint32_t pl=0; plcaches[cache_index] = (struct cpuinfo_cache) { + .size = perflevels[pl].l2cachesize, + .associativity = l2_cache_associativity, + .sets = perflevels[pl].l2cachesize / (l2_cache_associativity * cacheline_size), + .partitions = cache_partitions, + .line_size = cacheline_size, + .flags = cache_flags, + .processor_start = cache_index * threads_per_l2, + .processor_count = threads_per_l2, + }; + } + } + } + } + + if(l3->count != 0){ + l3->caches = calloc(l3->count, sizeof(*(l3->caches))); + if(l3->caches == NULL){ + cpuinfo_log_error("failed to allocate %zu bytes for descriptions of %"PRIu32" L3 caches", + l3->count * sizeof(*(l3->caches)), l3->count); + return false; + } + uint32_t cache_index = 0; + for(uint32_t pl=0; plcaches[cache_index] = (struct cpuinfo_cache) { + .size = perflevels[pl].l3cachesize, + .associativity = l3_cache_associativity, + .sets = perflevels[pl].l3cachesize / (l3_cache_associativity * cacheline_size), + .partitions = cache_partitions, + .line_size = cacheline_size, + .flags = cache_flags, + .processor_start = cache_index * threads_per_l3, + .processor_count = threads_per_l3, + }; + } + } + } + } + + return true; +} + +void cpuinfo_arm_mach_init(void) { + const uint32_t cpu_family = get_sys_info_by_name("hw.cpufamily"); + const uint32_t cpu_type = get_sys_info_by_name("hw.cputype"); + const 
uint32_t cpu_subtype = get_sys_info_by_name("hw.cpusubtype"); + + detect_isa(cpu_family, cpu_type, cpu_subtype); + + struct cpuinfo_processor* processors = NULL; + struct cpuinfo_core* cores = NULL; + struct cpuinfo_cluster* clusters = NULL; + struct cpuinfo_package* packages = NULL; + struct cpuinfo_uarch_info* uarchs = NULL; + struct cache_array l1i = {0}; + struct cache_array l1d = {0}; + struct cache_array l2 = {0}; + struct cache_array l3 = {0}; + + struct cpuinfo_mach_topology mach_topology = cpuinfo_mach_detect_topology(); + + /* + * iOS 15 and macOS Monterey 12 added sysctls for specifying different performance + * levels. Probe `hw.nperflevels` to see if they're present. If so, + * read and validate them. + */ + struct mach_perflevel * perflevels = NULL; + const uint32_t nperflevels = get_sys_info_by_name("hw.nperflevels"); + if(nperflevels > 1){ + perflevels = read_perflevels(nperflevels); + + if(!perflevels){ + cpuinfo_log_error("failed to initialize perflevels"); + goto cleanup; + } + + /* Double-check topology counts */ + uint32_t cores = 0; + uint32_t threads = 0; + for(uint32_t i=0; i 0 && perflevels){ + if(nperflevels != num_clusters){ + cpuinfo_log_error("mismatch topology cluster count (%"PRIu32" != %"PRIu32").", + nperflevels, num_clusters); + goto cleanup; + } + } + + clusters = calloc(num_clusters, sizeof(*clusters)); + if (clusters == NULL) { + cpuinfo_log_error( + "failed to allocate %zu bytes for descriptions of %"PRIu32" clusters", + num_clusters * sizeof(*clusters), num_clusters); + goto cleanup; + } + uarchs = calloc(num_clusters, sizeof(*uarchs)); + if (uarchs == NULL) { + cpuinfo_log_error( + "failed to allocate %zu bytes for descriptions of %"PRIu32" uarchs", + num_clusters * sizeof(*uarchs), num_clusters); + goto cleanup; + } + uint32_t cluster_idx = UINT32_MAX; + for (uint32_t i = 0; i < mach_topology.cores; i++) { + if (i == 0 || cores[i].uarch != cores[i - 1].uarch) { + cluster_idx++; + uarchs[cluster_idx] = (struct 
cpuinfo_uarch_info) { + .uarch = cores[i].uarch, + .processor_count = 1, + .core_count = 1, + }; + clusters[cluster_idx] = (struct cpuinfo_cluster) { + .processor_start = i * threads_per_core, + .processor_count = 1, + .core_start = i, + .core_count = 1, + .cluster_id = cluster_idx, + .package = cores[i].package, + .vendor = cores[i].vendor, + .uarch = cores[i].uarch, + }; + } else { + uarchs[cluster_idx].processor_count++; + uarchs[cluster_idx].core_count++; + clusters[cluster_idx].processor_count++; + clusters[cluster_idx].core_count++; + } + cores[i].cluster = &clusters[cluster_idx]; + } + + for (uint32_t i = 0; i < mach_topology.threads; i++) { + const uint32_t core_id = i / threads_per_core; + processors[i].cluster = cores[core_id].cluster; + } + + for (uint32_t i = 0; i < mach_topology.packages; i++) { + packages[i].cluster_start = 0; + packages[i].cluster_count = num_clusters; + } + + /* Detect and populate caches */ + + /* + * Prefer perflevels to detect caches. Fallback on error or if + * perflevels are not available. 
+ */ + bool cachesDetected = false; + if(nperflevels > 0 && perflevels){ + cachesDetected = detect_caches_using_perflevels(mach_topology, perflevels, nperflevels, &l1i, &l1d, &l2, &l3); + } + + if(!cachesDetected){ + cachesDetected = detect_caches_legacy(mach_topology, &l1i, &l1d, &l2, &l3); + if(!cachesDetected){ + goto cleanup; + } + } + + /* Associate processors with caches */ + + if(l1i.caches && l1i.count > 0){ + for(uint32_t c=0; c 0){ + for(uint32_t c=0; c 0){ + for(uint32_t c=0; c 0){ + for(uint32_t c=0; c #include +#include #include #include @@ -55,26 +56,76 @@ void report_cache( } } +void report_distinct_caches( + uint32_t count, const struct cpuinfo_cache *cache, + uint32_t level, const char *label) +{ + uint32_t similar_count = 0; + uint32_t prev = 0; + for(uint32_t i=0; i 0){ + report_cache(similar_count, &cache[prev], level, label); + } +} + +void debug_print_caches(const char *label, const struct cpuinfo_cache * const cache, uint32_t count) +{ + for(uint32_t i=0; i 1 && 0 == strcmp(argv[1], "-verbose")){ + debug_print_caches("L1I", cpuinfo_get_l1i_caches(), cpuinfo_get_l1i_caches_count()); + debug_print_caches("L1D", cpuinfo_get_l1d_caches(), cpuinfo_get_l1d_caches_count()); + debug_print_caches("L2", cpuinfo_get_l2_caches(), cpuinfo_get_l2_caches_count()); + debug_print_caches("L3", cpuinfo_get_l3_caches(), cpuinfo_get_l3_caches_count()); + debug_print_caches("L4", cpuinfo_get_l4_caches(), cpuinfo_get_l4_caches_count()); + } + printf("Max cache size (upper bound): %"PRIu32" bytes\n", cpuinfo_get_max_cache_size()); if (cpuinfo_get_l1i_caches_count() != 0 && (cpuinfo_get_l1i_cache(0)->flags & CPUINFO_CACHE_UNIFIED) == 0) { - report_cache(cpuinfo_get_l1i_caches_count(), cpuinfo_get_l1i_cache(0), 1, "instruction"); + report_distinct_caches(cpuinfo_get_l1i_caches_count(), cpuinfo_get_l1i_caches(), 1, "instruction"); } if (cpuinfo_get_l1d_caches_count() != 0) { - report_cache(cpuinfo_get_l1d_caches_count(), cpuinfo_get_l1d_cache(0), 1, "data"); + 
report_distinct_caches(cpuinfo_get_l1d_caches_count(), cpuinfo_get_l1d_caches(), 1, "data"); } if (cpuinfo_get_l2_caches_count() != 0) { - report_cache(cpuinfo_get_l2_caches_count(), cpuinfo_get_l2_cache(0), 2, "data"); + report_distinct_caches(cpuinfo_get_l2_caches_count(), cpuinfo_get_l2_caches(), 2, "data"); } if (cpuinfo_get_l3_caches_count() != 0) { - report_cache(cpuinfo_get_l3_caches_count(), cpuinfo_get_l3_cache(0), 3, "data"); + report_distinct_caches(cpuinfo_get_l3_caches_count(), cpuinfo_get_l3_caches(), 3, "data"); } if (cpuinfo_get_l4_caches_count() != 0) { - report_cache(cpuinfo_get_l4_caches_count(), cpuinfo_get_l4_cache(0), 4, "data"); + report_distinct_caches(cpuinfo_get_l4_caches_count(), cpuinfo_get_l4_caches(), 4, "data"); } } diff --git a/tools/isa-info.c b/tools/isa-info.c index 92abb57d..7f390660 100644 --- a/tools/isa-info.c +++ b/tools/isa-info.c @@ -133,6 +133,8 @@ int main(int argc, char** argv) { printf("\tVFPv4: %s\n", cpuinfo_has_arm_vfpv4() ? "yes" : "no"); printf("\tVFPv4+D32: %s\n", cpuinfo_has_arm_vfpv4_d32() ? "yes" : "no"); printf("\tVJCVT: %s\n", cpuinfo_has_arm_jscvt() ? "yes" : "no"); + printf("\tFMLAL/FMLSL: %s\n", cpuinfo_has_arm_fhm() ? "yes" : "no"); + printf("SIMD extensions:\n"); printf("\tWMMX: %s\n", cpuinfo_has_arm_wmmx() ? "yes" : "no"); @@ -144,6 +146,7 @@ int main(int argc, char** argv) { printf("\tNEON FP16 arithmetics: %s\n", cpuinfo_has_arm_neon_fp16_arith() ? "yes" : "no"); printf("\tNEON complex: %s\n", cpuinfo_has_arm_fcma() ? "yes" : "no"); printf("\tNEON dot product: %s\n", cpuinfo_has_arm_neon_dot() ? "yes" : "no"); + printf("\tNEON VFMLAL/VFMLSL: %s\n", cpuinfo_has_arm_fhm() ? "yes" : "no"); printf("Cryptography extensions:\n"); printf("\tAES: %s\n", cpuinfo_has_arm_aes() ? "yes" : "no"); @@ -157,6 +160,7 @@ int main(int argc, char** argv) { printf("\tARM v8.1 atomics: %s\n", cpuinfo_has_arm_atomics() ? "yes" : "no"); printf("\tARM v8.1 SQRDMLxH: %s\n", cpuinfo_has_arm_neon_rdm() ? 
"yes" : "no"); printf("\tARM v8.2 FP16 arithmetics: %s\n", cpuinfo_has_arm_fp16_arith() ? "yes" : "no"); + printf("\tARM v8.2 FHM: %s\n", cpuinfo_has_arm_fhm() ? "yes" : "no"); printf("\tARM v8.3 dot product: %s\n", cpuinfo_has_arm_neon_dot() ? "yes" : "no"); printf("\tARM v8.3 JS conversion: %s\n", cpuinfo_has_arm_jscvt() ? "yes" : "no"); printf("\tARM v8.3 complex: %s\n", cpuinfo_has_arm_fcma() ? "yes" : "no");