diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/404.html b/404.html new file mode 100644 index 0000000..ff66d5a --- /dev/null +++ b/404.html @@ -0,0 +1,176 @@ + + + + + + + + Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • +
  • +
  • +
+
+
+
+
+ + +

404

+ +

Page not found

+ + +
+
+ +
+
+ +
+ +
+ +
+ + + + + +
+ + + + + + + + diff --git a/css/fonts/Roboto-Slab-Bold.woff b/css/fonts/Roboto-Slab-Bold.woff new file mode 100644 index 0000000..6cb6000 Binary files /dev/null and b/css/fonts/Roboto-Slab-Bold.woff differ diff --git a/css/fonts/Roboto-Slab-Bold.woff2 b/css/fonts/Roboto-Slab-Bold.woff2 new file mode 100644 index 0000000..7059e23 Binary files /dev/null and b/css/fonts/Roboto-Slab-Bold.woff2 differ diff --git a/css/fonts/Roboto-Slab-Regular.woff b/css/fonts/Roboto-Slab-Regular.woff new file mode 100644 index 0000000..f815f63 Binary files /dev/null and b/css/fonts/Roboto-Slab-Regular.woff differ diff --git a/css/fonts/Roboto-Slab-Regular.woff2 b/css/fonts/Roboto-Slab-Regular.woff2 new file mode 100644 index 0000000..f2c76e5 Binary files /dev/null and b/css/fonts/Roboto-Slab-Regular.woff2 differ diff --git a/css/fonts/fontawesome-webfont.eot b/css/fonts/fontawesome-webfont.eot new file mode 100644 index 0000000..e9f60ca Binary files /dev/null and b/css/fonts/fontawesome-webfont.eot differ diff --git a/css/fonts/fontawesome-webfont.svg b/css/fonts/fontawesome-webfont.svg new file mode 100644 index 0000000..855c845 --- /dev/null +++ b/css/fonts/fontawesome-webfont.svg @@ -0,0 +1,2671 @@ + + + + +Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016 + By ,,, +Copyright Dave Gandy 2016. 
All rights reserveddiff --git a/css/fonts/fontawesome-webfont.ttf b/css/fonts/fontawesome-webfont.ttf new file mode 100644 index 0000000..35acda2 Binary files /dev/null and b/css/fonts/fontawesome-webfont.ttf differ diff --git a/css/fonts/fontawesome-webfont.woff b/css/fonts/fontawesome-webfont.woff new file mode 100644 index 0000000..400014a Binary files /dev/null and b/css/fonts/fontawesome-webfont.woff differ diff --git a/css/fonts/fontawesome-webfont.woff2 b/css/fonts/fontawesome-webfont.woff2 new file mode 100644 index 0000000..4d13fc6 Binary files /dev/null and b/css/fonts/fontawesome-webfont.woff2 differ diff --git a/css/fonts/lato-bold-italic.woff b/css/fonts/lato-bold-italic.woff new file mode 100644 index 0000000..88ad05b Binary files /dev/null and b/css/fonts/lato-bold-italic.woff differ diff --git a/css/fonts/lato-bold-italic.woff2 b/css/fonts/lato-bold-italic.woff2 new file mode 100644 index 0000000..c4e3d80 Binary files /dev/null and b/css/fonts/lato-bold-italic.woff2 differ diff --git a/css/fonts/lato-bold.woff b/css/fonts/lato-bold.woff new file mode 100644 index 0000000..c6dff51 Binary files /dev/null and b/css/fonts/lato-bold.woff differ diff --git a/css/fonts/lato-bold.woff2 b/css/fonts/lato-bold.woff2 new file mode 100644 index 0000000..bb19504 Binary files /dev/null and b/css/fonts/lato-bold.woff2 differ diff --git a/css/fonts/lato-normal-italic.woff b/css/fonts/lato-normal-italic.woff new file mode 100644 index 0000000..76114bc Binary files /dev/null and b/css/fonts/lato-normal-italic.woff differ diff --git a/css/fonts/lato-normal-italic.woff2 b/css/fonts/lato-normal-italic.woff2 new file mode 100644 index 0000000..3404f37 Binary files /dev/null and b/css/fonts/lato-normal-italic.woff2 differ diff --git a/css/fonts/lato-normal.woff b/css/fonts/lato-normal.woff new file mode 100644 index 0000000..ae1307f Binary files /dev/null and b/css/fonts/lato-normal.woff differ diff --git a/css/fonts/lato-normal.woff2 b/css/fonts/lato-normal.woff2 new file 
mode 100644 index 0000000..3bf9843 Binary files /dev/null and b/css/fonts/lato-normal.woff2 differ diff --git a/css/theme.css b/css/theme.css new file mode 100644 index 0000000..ad77300 --- /dev/null +++ b/css/theme.css @@ -0,0 +1,13 @@ +/* + * This file is copied from the upstream ReadTheDocs Sphinx + * theme. To aid upgradability this file should *not* be edited. + * modifications we need should be included in theme_extra.css. + * + * https://github.com/readthedocs/sphinx_rtd_theme + */ + + /* sphinx_rtd_theme version 1.2.0 | MIT license */ +html{box-sizing:border-box}*,:after,:before{box-sizing:inherit}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}[hidden],audio:not([controls]){display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:700}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;text-decoration:none}ins,mark{color:#000}mark{background:#ff0;font-style:italic;font-weight:700}.rst-content code,.rst-content tt,code,kbd,pre,samp{font-family:monospace,serif;_font-family:courier 
new,monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:after,q:before{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}dl,ol,ul{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure,form{margin:0}label{cursor:pointer}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type=button],input[type=reset],input[type=submit]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}textarea{resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none!important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media print{body,html,section{background:none!important}*{box-shadow:none!important;text-shadow:none!important;filter:none!important;-ms-filter:none!important}a,a:visited{text-decoration:underline}.ir 
a:after,a[href^="#"]:after,a[href^="javascript:"]:after{content:""}blockquote,pre{page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}img{max-width:100%!important}@page{margin:.5cm}.rst-content .toctree-wrapper>p.caption,h2,h3,p{orphans:3;widows:3}.rst-content .toctree-wrapper>p.caption,h2,h3{page-break-after:avoid}}.btn,.fa:before,.icon:before,.rst-content .admonition,.rst-content .admonition-title:before,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .code-block-caption .headerlink:before,.rst-content .danger,.rst-content .eqno .headerlink:before,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-alert,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li 
button.toctree-expand:before,input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week],select,textarea{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}/*! + * Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome + * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) + */@font-face{font-family:FontAwesome;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713);src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix&v=4.7.0) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#fontawesomeregular) format("svg");font-weight:400;font-style:normal}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{display:inline-block;font:normal normal normal 14px/1 
FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:.08em solid #eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa-pull-left.icon,.fa.fa-pull-left,.rst-content .code-block-caption .fa-pull-left.headerlink,.rst-content .eqno .fa-pull-left.headerlink,.rst-content .fa-pull-left.admonition-title,.rst-content code.download span.fa-pull-left:first-child,.rst-content dl dt .fa-pull-left.headerlink,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content p .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.wy-menu-vertical li.current>a button.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-left.toctree-expand,.wy-menu-vertical li button.fa-pull-left.toctree-expand{margin-right:.3em}.fa-pull-right.icon,.fa.fa-pull-right,.rst-content .code-block-caption .fa-pull-right.headerlink,.rst-content .eqno .fa-pull-right.headerlink,.rst-content .fa-pull-right.admonition-title,.rst-content code.download span.fa-pull-right:first-child,.rst-content dl dt .fa-pull-right.headerlink,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content 
h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content p .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.wy-menu-vertical li.current>a button.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-right.toctree-expand,.wy-menu-vertical li button.fa-pull-right.toctree-expand{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.pull-left.icon,.rst-content .code-block-caption .pull-left.headerlink,.rst-content .eqno .pull-left.headerlink,.rst-content .pull-left.admonition-title,.rst-content code.download span.pull-left:first-child,.rst-content dl dt .pull-left.headerlink,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content p .pull-left.headerlink,.rst-content table>caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.wy-menu-vertical li.current>a button.pull-left.toctree-expand,.wy-menu-vertical li.on a button.pull-left.toctree-expand,.wy-menu-vertical li button.pull-left.toctree-expand{margin-right:.3em}.fa.pull-right,.pull-right.icon,.rst-content .code-block-caption .pull-right.headerlink,.rst-content .eqno .pull-right.headerlink,.rst-content .pull-right.admonition-title,.rst-content code.download span.pull-right:first-child,.rst-content dl dt .pull-right.headerlink,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content p .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.wy-menu-vertical li.current>a 
button.pull-right.toctree-expand,.wy-menu-vertical li.on a button.pull-right.toctree-expand,.wy-menu-vertical li button.pull-right.toctree-expand{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);-ms-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scaleY(-1);-ms-transform:scaleY(-1);transform:scaleY(-1)}:root .fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root 
.fa-rotate-270{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-close:before,.fa-remove:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-cog:before,.fa-gear:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content code.download span:first-child:before,.rst-content tt.download 
span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-repeat:before,.fa-rotate-right:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-image:before,.fa-photo:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content
:""}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.rst-content .admonition-title:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-exclamation-triangle:before,.fa-warning:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-cogs:before,.fa-gears:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before
{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{content:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-floppy-o:before,.fa-save:before{content:""}.fa-square:before{content:""}.fa-bars:before,.fa-navicon:before,.fa-reorder:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magi
c:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.icon-caret-down:before,.wy-dropdown .caret:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-sort:before,.fa-unsorted:before{content:""}.fa-sort-desc:before,.fa-sort-down:before{content:""}.fa-sort-asc:before,.fa-sort-up:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-gavel:before,.fa-legal:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-bolt:before,.fa-flash:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-clipboard:before,.fa-paste:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-l
aptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinner:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-chain-broken:before,.fa-unlink:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:
""}.fa-minus-square-o:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-caret-square-o-down:before,.fa-toggle-down:before{content:""}.fa-caret-square-o-up:before,.fa-toggle-up:before{content:""}.fa-caret-square-o-right:before,.fa-toggle-right:before{content:""}.fa-eur:before,.fa-euro:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-inr:before,.fa-rupee:before{content:""}.fa-cny:before,.fa-jpy:before,.fa-rmb:before,.fa-yen:before{content:""}.fa-rouble:before,.fa-rub:before,.fa-ruble:before{content:""}.fa-krw:before,.fa-won:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-a
pple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-caret-square-o-left:before,.fa-toggle-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-vimeo-square:before{content:""}.fa-try:before,.fa-turkish-lira:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li button.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-bank:before,.fa-institution:before,.fa-university:before{content:""}.fa-graduation-cap:before,.fa-mortar-board:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-squa
re:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-image-o:before,.fa-file-photo-o:before,.fa-file-picture-o:before{content:""}.fa-file-archive-o:before,.fa-file-zip-o:before{content:""}.fa-file-audio-o:before,.fa-file-sound-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-ring:before,.fa-life-saver:before,.fa-support:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-rebel:before,.fa-resistance:before{content:""}.fa-empire:before,.fa-ge:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-hacker-news:before,.fa-y-combinator-square:before,.fa-yc-square:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-paper-plane:before,.fa-send:before{content:""}.fa-paper-plane-o:before,.fa-send-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-futbol-o:before,.fa-soccer-ball-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{conten
t:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-bell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-ils:before,.fa-shekel:before,.fa-sheqel:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-m
ars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-bed:before,.fa-hotel:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-y-combinator:before,.fa-yc:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery-full:before,.fa-battery:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-paper-o:before,.fa-hand-stop-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-registered:before{content:""}.fa-creative-commons
:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-internet-explorer:before{content:""}.fa-television:before,.fa-tv:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{conten
t:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-american-sign-language-interpreting:before,.fa-asl-interpreting:before{content:""}.fa-deaf:before,.fa-deafness:before,.fa-hard-of-hearing:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-sign-language:before,.fa-signing:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-address-card:before,.fa-vcard:before{content:""}.fa-address-card-o:before,.fa-vcard-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{content:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer-full:before,.fa-thermometer:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:befo
re{content:""}.fa-bath:before,.fa-bathtub:before,.fa-s15:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{content:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{font-family:inherit}.fa:before,.icon:before,.rst-content .admonition-title:before,.rst-content .code-block-caption 
.headerlink:before,.rst-content .eqno .headerlink:before,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before{font-family:FontAwesome;display:inline-block;font-style:normal;font-weight:400;line-height:1;text-decoration:inherit}.rst-content .code-block-caption a .headerlink,.rst-content .eqno a .headerlink,.rst-content a .admonition-title,.rst-content code.download a span:first-child,.rst-content dl dt a .headerlink,.rst-content h1 a .headerlink,.rst-content h2 a .headerlink,.rst-content h3 a .headerlink,.rst-content h4 a .headerlink,.rst-content h5 a .headerlink,.rst-content h6 a .headerlink,.rst-content p.caption a .headerlink,.rst-content p a .headerlink,.rst-content table>caption a .headerlink,.rst-content tt.download a span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li a button.toctree-expand,a .fa,a .icon,a .rst-content .admonition-title,a .rst-content .code-block-caption .headerlink,a .rst-content .eqno .headerlink,a .rst-content code.download span:first-child,a .rst-content dl dt .headerlink,a 
.rst-content h1 .headerlink,a .rst-content h2 .headerlink,a .rst-content h3 .headerlink,a .rst-content h4 .headerlink,a .rst-content h5 .headerlink,a .rst-content h6 .headerlink,a .rst-content p.caption .headerlink,a .rst-content p .headerlink,a .rst-content table>caption .headerlink,a .rst-content tt.download span:first-child,a .wy-menu-vertical li button.toctree-expand{display:inline-block;text-decoration:inherit}.btn .fa,.btn .icon,.btn .rst-content .admonition-title,.btn .rst-content .code-block-caption .headerlink,.btn .rst-content .eqno .headerlink,.btn .rst-content code.download span:first-child,.btn .rst-content dl dt .headerlink,.btn .rst-content h1 .headerlink,.btn .rst-content h2 .headerlink,.btn .rst-content h3 .headerlink,.btn .rst-content h4 .headerlink,.btn .rst-content h5 .headerlink,.btn .rst-content h6 .headerlink,.btn .rst-content p .headerlink,.btn .rst-content table>caption .headerlink,.btn .rst-content tt.download span:first-child,.btn .wy-menu-vertical li.current>a button.toctree-expand,.btn .wy-menu-vertical li.on a button.toctree-expand,.btn .wy-menu-vertical li button.toctree-expand,.nav .fa,.nav .icon,.nav .rst-content .admonition-title,.nav .rst-content .code-block-caption .headerlink,.nav .rst-content .eqno .headerlink,.nav .rst-content code.download span:first-child,.nav .rst-content dl dt .headerlink,.nav .rst-content h1 .headerlink,.nav .rst-content h2 .headerlink,.nav .rst-content h3 .headerlink,.nav .rst-content h4 .headerlink,.nav .rst-content h5 .headerlink,.nav .rst-content h6 .headerlink,.nav .rst-content p .headerlink,.nav .rst-content table>caption .headerlink,.nav .rst-content tt.download span:first-child,.nav .wy-menu-vertical li.current>a button.toctree-expand,.nav .wy-menu-vertical li.on a button.toctree-expand,.nav .wy-menu-vertical li button.toctree-expand,.rst-content .btn .admonition-title,.rst-content .code-block-caption .btn .headerlink,.rst-content .code-block-caption .nav .headerlink,.rst-content .eqno .btn 
.headerlink,.rst-content .eqno .nav .headerlink,.rst-content .nav .admonition-title,.rst-content code.download .btn span:first-child,.rst-content code.download .nav span:first-child,.rst-content dl dt .btn .headerlink,.rst-content dl dt .nav .headerlink,.rst-content h1 .btn .headerlink,.rst-content h1 .nav .headerlink,.rst-content h2 .btn .headerlink,.rst-content h2 .nav .headerlink,.rst-content h3 .btn .headerlink,.rst-content h3 .nav .headerlink,.rst-content h4 .btn .headerlink,.rst-content h4 .nav .headerlink,.rst-content h5 .btn .headerlink,.rst-content h5 .nav .headerlink,.rst-content h6 .btn .headerlink,.rst-content h6 .nav .headerlink,.rst-content p .btn .headerlink,.rst-content p .nav .headerlink,.rst-content table>caption .btn .headerlink,.rst-content table>caption .nav .headerlink,.rst-content tt.download .btn span:first-child,.rst-content tt.download .nav span:first-child,.wy-menu-vertical li .btn button.toctree-expand,.wy-menu-vertical li.current>a .btn button.toctree-expand,.wy-menu-vertical li.current>a .nav button.toctree-expand,.wy-menu-vertical li .nav button.toctree-expand,.wy-menu-vertical li.on a .btn button.toctree-expand,.wy-menu-vertical li.on a .nav button.toctree-expand{display:inline}.btn .fa-large.icon,.btn .fa.fa-large,.btn .rst-content .code-block-caption .fa-large.headerlink,.btn .rst-content .eqno .fa-large.headerlink,.btn .rst-content .fa-large.admonition-title,.btn .rst-content code.download span.fa-large:first-child,.btn .rst-content dl dt .fa-large.headerlink,.btn .rst-content h1 .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.btn .rst-content p .fa-large.headerlink,.btn .rst-content table>caption .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.btn .wy-menu-vertical li button.fa-large.toctree-expand,.nav 
.fa-large.icon,.nav .fa.fa-large,.nav .rst-content .code-block-caption .fa-large.headerlink,.nav .rst-content .eqno .fa-large.headerlink,.nav .rst-content .fa-large.admonition-title,.nav .rst-content code.download span.fa-large:first-child,.nav .rst-content dl dt .fa-large.headerlink,.nav .rst-content h1 .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.nav .rst-content p .fa-large.headerlink,.nav .rst-content table>caption .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.nav .wy-menu-vertical li button.fa-large.toctree-expand,.rst-content .btn .fa-large.admonition-title,.rst-content .code-block-caption .btn .fa-large.headerlink,.rst-content .code-block-caption .nav .fa-large.headerlink,.rst-content .eqno .btn .fa-large.headerlink,.rst-content .eqno .nav .fa-large.headerlink,.rst-content .nav .fa-large.admonition-title,.rst-content code.download .btn span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.rst-content dl dt .btn .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.rst-content p .btn .fa-large.headerlink,.rst-content p .nav .fa-large.headerlink,.rst-content table>caption .btn .fa-large.headerlink,.rst-content table>caption .nav .fa-large.headerlink,.rst-content tt.download .btn 
span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.wy-menu-vertical li .btn button.fa-large.toctree-expand,.wy-menu-vertical li .nav button.fa-large.toctree-expand{line-height:.9em}.btn .fa-spin.icon,.btn .fa.fa-spin,.btn .rst-content .code-block-caption .fa-spin.headerlink,.btn .rst-content .eqno .fa-spin.headerlink,.btn .rst-content .fa-spin.admonition-title,.btn .rst-content code.download span.fa-spin:first-child,.btn .rst-content dl dt .fa-spin.headerlink,.btn .rst-content h1 .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.btn .rst-content p .fa-spin.headerlink,.btn .rst-content table>caption .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.btn .wy-menu-vertical li button.fa-spin.toctree-expand,.nav .fa-spin.icon,.nav .fa.fa-spin,.nav .rst-content .code-block-caption .fa-spin.headerlink,.nav .rst-content .eqno .fa-spin.headerlink,.nav .rst-content .fa-spin.admonition-title,.nav .rst-content code.download span.fa-spin:first-child,.nav .rst-content dl dt .fa-spin.headerlink,.nav .rst-content h1 .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.nav .rst-content p .fa-spin.headerlink,.nav .rst-content table>caption .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.nav .wy-menu-vertical li button.fa-spin.toctree-expand,.rst-content .btn .fa-spin.admonition-title,.rst-content .code-block-caption .btn .fa-spin.headerlink,.rst-content .code-block-caption .nav .fa-spin.headerlink,.rst-content .eqno .btn .fa-spin.headerlink,.rst-content .eqno .nav .fa-spin.headerlink,.rst-content .nav .fa-spin.admonition-title,.rst-content code.download 
.btn span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.rst-content dl dt .btn .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.rst-content p .btn .fa-spin.headerlink,.rst-content p .nav .fa-spin.headerlink,.rst-content table>caption .btn .fa-spin.headerlink,.rst-content table>caption .nav .fa-spin.headerlink,.rst-content tt.download .btn span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.wy-menu-vertical li .btn button.fa-spin.toctree-expand,.wy-menu-vertical li .nav button.fa-spin.toctree-expand{display:inline-block}.btn.fa:before,.btn.icon:before,.rst-content .btn.admonition-title:before,.rst-content .code-block-caption .btn.headerlink:before,.rst-content .eqno .btn.headerlink:before,.rst-content code.download span.btn:first-child:before,.rst-content dl dt .btn.headerlink:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content p .btn.headerlink:before,.rst-content table>caption .btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.wy-menu-vertical li button.btn.toctree-expand:before{opacity:.5;-webkit-transition:opacity .05s ease-in;-moz-transition:opacity .05s ease-in;transition:opacity .05s ease-in}.btn.fa:hover:before,.btn.icon:hover:before,.rst-content 
.btn.admonition-title:hover:before,.rst-content .code-block-caption .btn.headerlink:hover:before,.rst-content .eqno .btn.headerlink:hover:before,.rst-content code.download span.btn:first-child:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content p .btn.headerlink:hover:before,.rst-content table>caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.wy-menu-vertical li button.btn.toctree-expand:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .icon:before,.btn-mini .rst-content .admonition-title:before,.btn-mini .rst-content .code-block-caption .headerlink:before,.btn-mini .rst-content .eqno .headerlink:before,.btn-mini .rst-content code.download span:first-child:before,.btn-mini .rst-content dl dt .headerlink:before,.btn-mini .rst-content h1 .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.btn-mini .rst-content p .headerlink:before,.btn-mini .rst-content table>caption .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.btn-mini .wy-menu-vertical li button.toctree-expand:before,.rst-content .btn-mini .admonition-title:before,.rst-content .code-block-caption .btn-mini .headerlink:before,.rst-content .eqno .btn-mini .headerlink:before,.rst-content code.download .btn-mini span:first-child:before,.rst-content dl dt .btn-mini .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.rst-content h4 .btn-mini 
.headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.rst-content p .btn-mini .headerlink:before,.rst-content table>caption .btn-mini .headerlink:before,.rst-content tt.download .btn-mini span:first-child:before,.wy-menu-vertical li .btn-mini button.toctree-expand:before{font-size:14px;vertical-align:-15%}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.wy-alert{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.rst-content .admonition-title,.wy-alert-title{font-weight:700;display:block;color:#fff;background:#6ab0de;padding:6px 12px;margin:-12px -12px 12px}.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.admonition,.rst-content .wy-alert-danger.admonition-todo,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.wy-alert.wy-alert-danger{background:#fdf3f2}.rst-content .danger .admonition-title,.rst-content .danger .wy-alert-title,.rst-content .error .admonition-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.rst-content .wy-alert-danger.admonition .admonition-title,.rst-content .wy-alert-danger.admonition .wy-alert-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .wy-alert-danger.hint 
.admonition-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.wy-alert.wy-alert-danger .wy-alert-title{background:#f29f97}.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .warning,.rst-content .wy-alert-warning.admonition,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.note,.rst-content .wy-alert-warning.seealso,.rst-content .wy-alert-warning.tip,.wy-alert.wy-alert-warning{background:#ffedcc}.rst-content .admonition-todo .admonition-title,.rst-content .admonition-todo .wy-alert-title,.rst-content .attention .admonition-title,.rst-content .attention .wy-alert-title,.rst-content .caution .admonition-title,.rst-content .caution .wy-alert-title,.rst-content .warning .admonition-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.admonition .admonition-title,.rst-content .wy-alert-warning.admonition .wy-alert-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.hint 
.wy-alert-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .wy-alert-warning.seealso .admonition-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.wy-alert.wy-alert-warning .wy-alert-title{background:#f0b37e}.rst-content .note,.rst-content .seealso,.rst-content .wy-alert-info.admonition,.rst-content .wy-alert-info.admonition-todo,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content .wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.wy-alert.wy-alert-info{background:#e7f2fa}.rst-content .note .admonition-title,.rst-content .note .wy-alert-title,.rst-content .seealso .admonition-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .admonition-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.rst-content .wy-alert-info.admonition .admonition-title,.rst-content .wy-alert-info.admonition .wy-alert-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content 
.wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.wy-alert.wy-alert-info .wy-alert-title{background:#6ab0de}.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.admonition,.rst-content .wy-alert-success.admonition-todo,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.warning,.wy-alert.wy-alert-success{background:#dbfaf4}.rst-content .hint .admonition-title,.rst-content .hint .wy-alert-title,.rst-content .important .admonition-title,.rst-content .important .wy-alert-title,.rst-content .tip .admonition-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .admonition-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.rst-content .wy-alert-success.admonition .admonition-title,.rst-content .wy-alert-success.admonition .wy-alert-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.seealso 
.admonition-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.wy-alert.wy-alert-success .wy-alert-title{background:#1abc9c}.rst-content .wy-alert-neutral.admonition,.rst-content .wy-alert-neutral.admonition-todo,.rst-content .wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content .wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.note,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.wy-alert.wy-alert-neutral{background:#f3f6f6}.rst-content .wy-alert-neutral.admonition-todo .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.rst-content .wy-alert-neutral.admonition .admonition-title,.rst-content .wy-alert-neutral.admonition .wy-alert-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.seealso 
.wy-alert-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.wy-alert.wy-alert-neutral .wy-alert-title{color:#404040;background:#e1e4e5}/* Links inside neutral alerts are colored #2980b9 */.rst-content .wy-alert-neutral.admonition-todo a,.rst-content .wy-alert-neutral.admonition a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content .wy-alert-neutral.important a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.wy-alert.wy-alert-neutral a{color:#2980b9}/* A last-child paragraph inside any admonition or alert drops its bottom margin */.rst-content .admonition-todo p:last-child,.rst-content .admonition p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .note p:last-child,.rst-content .seealso p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.wy-alert p:last-child{margin-bottom:0}/* Tray: fixed to the bottom-left corner at z-index 600 */.wy-tray-container{position:fixed;bottom:0;left:0;z-index:600}/* Tray items start with zero height and zero opacity and transition all properties over .3s */.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all .3s ease-in;-moz-transition:all .3s ease-in;transition:all .3s ease-in}/* Per-status tray item background colors; the danger variant wraps onto the next line */.wy-tray-container li.wy-tray-item-success{background:#27ae60}.wy-tray-container li.wy-tray-item-info{background:#2980b9}.wy-tray-container li.wy-tray-item-warning{background:#e67e22}.wy-tray-container 
li.wy-tray-item-danger{background:#e74c3c}/* Tray items carrying the on class become visible at their full 56px height */.wy-tray-container li.on{opacity:1;height:56px}/* On screens 768px wide or narrower the tray moves to the top edge and spans the full width */@media screen and (max-width:768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container li{width:100%}}/* Native button reset; the star-prefixed declarations look like legacy IE-only hacks and are kept untouched */button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}/* Base .btn appearance. The invalid outline-none:false declaration that used to sit after box-shadow has been dropped: outline-none is not a CSS property, so browsers discarded it and validators reported it. Focus outline handling lives in the .btn:focus rule. */.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px;color:#fff;border:1px solid rgba(0,0,0,.1);background-color:#27ae60;text-decoration:none;font-weight:400;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 2px -1px hsla(0,0%,100%,.5),inset 0 -2px 0 0 rgba(0,0,0,.1);vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all .1s linear;-moz-transition:all .1s linear;transition:all .1s linear}/* Button interaction states; the active state swaps the top and bottom padding (8px/6px instead of 6px/8px) and changes the inset shadow */.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:inset 0 -1px 0 0 rgba(0,0,0,.05),inset 0 2px 0 0 rgba(0,0,0,.1);padding:8px 12px 6px}.btn:visited{color:#fff}/* Disabled buttons; this declaration block wraps onto the next line */.btn-disabled,.btn-disabled:active,.btn-disabled:focus,.btn-disabled:hover,.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = 
false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}/* Button color variants; most backgrounds are forced with !important */.btn-info{background-color:#2980b9!important}.btn-info:hover{background-color:#2e8ece!important}.btn-neutral{background-color:#f3f6f6!important;color:#404040!important}.btn-neutral:hover{background-color:#e5ebeb!important;color:#404040}.btn-neutral:visited{color:#404040!important}.btn-success{background-color:#27ae60!important}.btn-success:hover{background-color:#295!important}.btn-danger{background-color:#e74c3c!important}.btn-danger:hover{background-color:#ea6153!important}.btn-warning{background-color:#e67e22!important}.btn-warning:hover{background-color:#e98b39!important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f!important}/* Link-styled button: transparent background and border, no shadow */.btn-link{background-color:transparent!important;color:#2980b9;box-shadow:none;border-color:transparent!important}.btn-link:active,.btn-link:hover{background-color:transparent!important;color:#409ad5!important;box-shadow:none}.btn-link:visited{color:#9b59b6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}/* Button group: 24px bottom margin, floats cleared through :before/:after table pseudo-elements */.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:after,.wy-btn-group:before{display:table;content:""}.wy-btn-group:after{clear:both}/* Dropdown: the menu is display:none until it sits inside an element with wy-dropdown-active */.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-active .wy-dropdown-menu{display:block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:1px solid #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,.1);padding:12px}/* Dropdown entries are dd children of the menu */.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980b9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:1px solid #cfd7dd;margin:6px 0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search 
input[type=search]{width:100%}/* Call-to-action entry: same #e3e3e3 background with and without hover */.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}/* Dropdown placement modifiers: up (menu above, right-aligned), bubble, left (right-aligned with right-aligned text) */.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980b9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;left:auto;text-align:right}/* Arrow pseudo-element: colored bottom border with transparent left and right borders, positioned 4px above the top edge */.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}/* Form layout: stacked and aligned variants */.wy-form-stacked select{display:block}.wy-form-aligned .wy-help-inline,.wy-form-aligned input,.wy-form-aligned label,.wy-form-aligned select,.wy-form-aligned textarea{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}/* Base fieldset, legend and label resets; the label rule wraps onto the next line */fieldset{margin:0}fieldset,legend{border:0;padding:0}legend{width:100%;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label,legend{display:block}label{margin:0 0 
.3125em;color:#333;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}/* Control group: 24px bottom margin, capped at 1200px wide, centered with auto side margins, floats cleared through :before/:after */.wy-control-group{margin-bottom:24px;max-width:1200px;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:after,.wy-control-group:before{display:table;content:""}.wy-control-group:after{clear:both}/* Required groups append a space and an asterisk in #e74c3c after the direct-child label */.wy-control-group.wy-control-group-required>label:after{content:" *";color:#e74c3c}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}/* Text-like inputs and selects inside the full, halves and thirds columns; this selector list wraps onto the next line, where its declaration block sets width 100% */.wy-control-group .wy-form-full input[type=color],.wy-control-group .wy-form-full input[type=date],.wy-control-group .wy-form-full input[type=datetime-local],.wy-control-group .wy-form-full input[type=datetime],.wy-control-group .wy-form-full input[type=email],.wy-control-group .wy-form-full input[type=month],.wy-control-group .wy-form-full input[type=number],.wy-control-group .wy-form-full input[type=password],.wy-control-group .wy-form-full input[type=search],.wy-control-group .wy-form-full input[type=tel],.wy-control-group .wy-form-full input[type=text],.wy-control-group .wy-form-full input[type=time],.wy-control-group .wy-form-full input[type=url],.wy-control-group .wy-form-full input[type=week],.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves input[type=color],.wy-control-group .wy-form-halves input[type=date],.wy-control-group .wy-form-halves input[type=datetime-local],.wy-control-group .wy-form-halves input[type=datetime],.wy-control-group .wy-form-halves input[type=email],.wy-control-group .wy-form-halves input[type=month],.wy-control-group .wy-form-halves input[type=number],.wy-control-group .wy-form-halves input[type=password],.wy-control-group .wy-form-halves input[type=search],.wy-control-group .wy-form-halves input[type=tel],.wy-control-group .wy-form-halves input[type=text],.wy-control-group .wy-form-halves input[type=time],.wy-control-group .wy-form-halves input[type=url],.wy-control-group 
.wy-form-halves input[type=week],.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds input[type=color],.wy-control-group .wy-form-thirds input[type=date],.wy-control-group .wy-form-thirds input[type=datetime-local],.wy-control-group .wy-form-thirds input[type=datetime],.wy-control-group .wy-form-thirds input[type=email],.wy-control-group .wy-form-thirds input[type=month],.wy-control-group .wy-form-thirds input[type=number],.wy-control-group .wy-form-thirds input[type=password],.wy-control-group .wy-form-thirds input[type=search],.wy-control-group .wy-form-thirds input[type=tel],.wy-control-group .wy-form-thirds input[type=text],.wy-control-group .wy-form-thirds input[type=time],.wy-control-group .wy-form-thirds input[type=url],.wy-control-group .wy-form-thirds input[type=week],.wy-control-group .wy-form-thirds select{width:100%}/* Float grid: full is 100% wide; halves are 48.82117% and thirds 31.76157% with a 2.35765% right gutter. The gutter is dropped on every 2nd half, every 3rd third and on the last child, and the first item of each row clears left. */.wy-control-group .wy-form-full{float:left;display:block;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{float:left;display:block;margin-right:2.35765%;width:48.82117%}.wy-control-group .wy-form-halves:last-child,.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(odd){clear:left}.wy-control-group .wy-form-thirds{float:left;display:block;margin-right:2.35765%;width:31.76157%}.wy-control-group .wy-form-thirds:last-child,.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}/* Controls that carry no input element */.wy-control-group.wy-control-group-no-input .wy-control,.wy-control-no-input{margin:6px 0 0;font-size:90%}.wy-control-no-input{display:inline-block}/* fluid-input groups: text-like inputs take width 100%; this selector list wraps onto the next line, where its declaration block sits */.wy-control-group.fluid-input input[type=color],.wy-control-group.fluid-input input[type=date],.wy-control-group.fluid-input input[type=datetime-local],.wy-control-group.fluid-input input[type=datetime],.wy-control-group.fluid-input input[type=email],.wy-control-group.fluid-input 
input[type=month],.wy-control-group.fluid-input input[type=number],.wy-control-group.fluid-input input[type=password],.wy-control-group.fluid-input input[type=search],.wy-control-group.fluid-input input[type=tel],.wy-control-group.fluid-input input[type=text],.wy-control-group.fluid-input input[type=time],.wy-control-group.fluid-input input[type=url],.wy-control-group.fluid-input input[type=week]{width:100%}/* Form helper messages: inline variant in #666 at 90%, block variant in italic #999 at 70% */.wy-form-message-inline{padding-left:.3em;color:#666;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}/* Base input styling */input{line-height:normal}input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;*overflow:visible}/* Text-like inputs: native WebKit appearance removed, 1px #ccc border, inset shadow, square corners, border changes transition over .3s */input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}/* datetime-local padding override; the declaration wraps onto the next line */input[type=datetime-local]{padding:.34375em 
.625em}input[disabled]{cursor:default}input[type=checkbox],input[type=radio]{padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type=checkbox],input[type=radio],input[type=search]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}/* Focus state for text-like inputs: outline removed and border darkened to #333. NOTE(review): the second outline declaration ends in a backslash-9 suffix, presumably an old-IE-only hack - confirm before touching. */input[type=color]:focus,input[type=date]:focus,input[type=datetime-local]:focus,input[type=datetime]:focus,input[type=email]:focus,input[type=month]:focus,input[type=number]:focus,input[type=password]:focus,input[type=search]:focus,input[type=tel]:focus,input[type=text]:focus,input[type=time]:focus,input[type=url]:focus,input[type=week]:focus{outline:0;outline:thin dotted\9;border-color:#333}input.no-focus:focus{border-color:#ccc!important}/* Checkbox, file and radio inputs keep a visible focus outline; the second declaration overrides the first where it is supported */input[type=checkbox]:focus,input[type=file]:focus,input[type=radio]:focus{outline:thin dotted #333;outline:1px auto #129fea}/* Disabled text-like inputs: not-allowed cursor on a #fafafa background */input[type=color][disabled],input[type=date][disabled],input[type=datetime-local][disabled],input[type=datetime][disabled],input[type=email][disabled],input[type=month][disabled],input[type=number][disabled],input[type=password][disabled],input[type=search][disabled],input[type=tel][disabled],input[type=text][disabled],input[type=time][disabled],input[type=url][disabled],input[type=week][disabled]{cursor:not-allowed;background-color:#fafafa}/* Invalid fields are highlighted in #e74c3c only while they have focus */input:focus:invalid,select:focus:invalid,textarea:focus:invalid{color:#e74c3c;border:1px solid #e74c3c}input:focus:invalid:focus,select:focus:invalid:focus,textarea:focus:invalid:focus{border-color:#e74c3c}input[type=checkbox]:focus:invalid:focus,input[type=file]:focus:invalid:focus,input[type=radio]:focus:invalid:focus{outline-color:#e74c3c}input.wy-input-large{padding:12px;font-size:100%}/* Textarea and select base styling; the shared rule wraps onto the next line */textarea{overflow:auto;vertical-align:top;width:100%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid 
#ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}input[readonly],select[disabled],select[readonly],textarea[disabled],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type=checkbox][disabled],input[type=radio][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:1px solid #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{left:0;top:0;width:36px;height:12px;background:#ccc}.wy-switch:after,.wy-switch:before{position:absolute;content:"";display:block;border-radius:4px;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.wy-switch:after{width:18px;height:18px;background:#999;left:-3px;top:-3px}.wy-switch span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27ae60}.wy-switch.disabled{cursor:not-allowed;opacity:.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#e74c3c}.wy-control-group.wy-control-group-error input[type=color],.wy-control-group.wy-control-group-error 
input[type=date],.wy-control-group.wy-control-group-error input[type=datetime-local],.wy-control-group.wy-control-group-error input[type=datetime],.wy-control-group.wy-control-group-error input[type=email],.wy-control-group.wy-control-group-error input[type=month],.wy-control-group.wy-control-group-error input[type=number],.wy-control-group.wy-control-group-error input[type=password],.wy-control-group.wy-control-group-error input[type=search],.wy-control-group.wy-control-group-error input[type=tel],.wy-control-group.wy-control-group-error input[type=text],.wy-control-group.wy-control-group-error input[type=time],.wy-control-group.wy-control-group-error input[type=url],.wy-control-group.wy-control-group-error input[type=week],.wy-control-group.wy-control-group-error textarea{border:1px solid #e74c3c}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27ae60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#e74c3c}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#e67e22}.wy-inline-validate.wy-inline-validate-info .wy-input-context{color:#2980b9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) 
rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width:480px){.wy-form button[type=submit]{margin:.7em 0 0}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=text],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week],.wy-form label{margin-bottom:.3em;display:block}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0}.wy-form-message,.wy-form-message-inline,.wy-form .wy-help-inline{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width:768px){.tablet-hide{display:none}}@media screen and (max-width:480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.rst-content table.docutils,.rst-content 
table.field-list,.wy-table{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.rst-content table.docutils caption,.rst-content table.field-list caption,.wy-table caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.rst-content table.docutils td,.rst-content table.docutils th,.rst-content table.field-list td,.rst-content table.field-list th,.wy-table td,.wy-table th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.rst-content table.docutils td:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list td:first-child,.rst-content table.field-list th:first-child,.wy-table td:first-child,.wy-table th:first-child{border-left-width:0}.rst-content table.docutils thead,.rst-content table.field-list thead,.wy-table thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.rst-content table.docutils thead th,.rst-content table.field-list thead th,.wy-table thead th{font-weight:700;border-bottom:2px solid #e1e4e5}.rst-content table.docutils td,.rst-content table.field-list td,.wy-table td{background-color:transparent;vertical-align:middle}.rst-content table.docutils td p,.rst-content table.field-list td p,.wy-table td p{line-height:18px}.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child,.wy-table td p:last-child{margin-bottom:0}.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min,.wy-table .wy-table-cell-min{width:1%;padding-right:0}.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:grey;font-size:90%}.wy-table-tertiary{color:grey;font-size:80%}.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td,.wy-table-backed,.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) 
td{background-color:#f3f6f6}.rst-content table.docutils,.wy-table-bordered-all{border:1px solid #e1e4e5}.rst-content table.docutils td,.wy-table-bordered-all td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.rst-content table.docutils tbody>tr:last-child td,.wy-table-bordered-all tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0!important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980b9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%}body,html{overflow-x:hidden}body{font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;font-weight:400;color:#404040;min-height:100%;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22!important}a.wy-text-warning:hover{color:#eb9950!important}.wy-text-info{color:#2980b9!important}a.wy-text-info:hover{color:#409ad5!important}.wy-text-success{color:#27ae60!important}a.wy-text-success:hover{color:#36d278!important}.wy-text-danger{color:#e74c3c!important}a.wy-text-danger:hover{color:#ed7669!important}.wy-text-neutral{color:#404040!important}a.wy-text-neutral:hover{color:#595959!important}.rst-content .toctree-wrapper>p.caption,h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:Roboto 
Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif}p{line-height:24px;font-size:16px;margin:0 0 24px}h1{font-size:175%}.rst-content .toctree-wrapper>p.caption,h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}.rst-content code,.rst-content tt,code{white-space:nowrap;max-width:100%;background:#fff;border:1px solid #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#e74c3c;overflow-x:auto}.rst-content tt.code-large,code.code-large{font-size:90%}.rst-content .section ul,.rst-content .toctree-wrapper ul,.rst-content section ul,.wy-plain-list-disc,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.rst-content .section ul li,.rst-content .toctree-wrapper ul li,.rst-content section ul li,.wy-plain-list-disc li,article ul li{list-style:disc;margin-left:24px}.rst-content .section ul li p:last-child,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li p:last-child,.rst-content .toctree-wrapper ul li ul,.rst-content section ul li p:last-child,.rst-content section ul li ul,.wy-plain-list-disc li p:last-child,.wy-plain-list-disc li ul,article ul li p:last-child,article ul li ul{margin-bottom:0}.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,.rst-content section ul li li,.wy-plain-list-disc li li,article ul li li{list-style:circle}.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,.rst-content section ul li li li,.wy-plain-list-disc li li li,article ul li li li{list-style:square}.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,.rst-content section ul li ol li,.wy-plain-list-disc li ol li,article ul li ol li{list-style:decimal}.rst-content .section ol,.rst-content .section ol.arabic,.rst-content .toctree-wrapper ol,.rst-content .toctree-wrapper ol.arabic,.rst-content 
section ol,.rst-content section ol.arabic,.wy-plain-list-decimal,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.rst-content .section ol.arabic li,.rst-content .section ol li,.rst-content .toctree-wrapper ol.arabic li,.rst-content .toctree-wrapper ol li,.rst-content section ol.arabic li,.rst-content section ol li,.wy-plain-list-decimal li,article ol li{list-style:decimal;margin-left:24px}.rst-content .section ol.arabic li ul,.rst-content .section ol li p:last-child,.rst-content .section ol li ul,.rst-content .toctree-wrapper ol.arabic li ul,.rst-content .toctree-wrapper ol li p:last-child,.rst-content .toctree-wrapper ol li ul,.rst-content section ol.arabic li ul,.rst-content section ol li p:last-child,.rst-content section ol li ul,.wy-plain-list-decimal li p:last-child,.wy-plain-list-decimal li ul,article ol li p:last-child,article ol li ul{margin-bottom:0}.rst-content .section ol.arabic li ul li,.rst-content .section ol li ul li,.rst-content .toctree-wrapper ol.arabic li ul li,.rst-content .toctree-wrapper ol li ul li,.rst-content section ol.arabic li ul li,.rst-content section ol li ul li,.wy-plain-list-decimal li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:after,.wy-breadcrumbs:before{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs>li{display:inline-block;padding-top:5px}.wy-breadcrumbs>li.wy-breadcrumbs-aside{float:right}.rst-content .wy-breadcrumbs>li code,.rst-content .wy-breadcrumbs>li tt,.wy-breadcrumbs>li .rst-content tt,.wy-breadcrumbs>li code{all:inherit;color:inherit}.breadcrumb-item:before{content:"/";color:#bbb;font-size:13px;padding:0 6px 0 3px}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width:480px){.wy-breadcrumbs-extra,.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs 
li.wy-breadcrumbs-aside{display:none}}html{font-size:16px}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:after,.wy-menu-horiz:before{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz li,.wy-menu-horiz ul{display:inline-block}.wy-menu-horiz li:hover{background:hsla(0,0%,100%,.1)}.wy-menu-horiz li.divide-left{border-left:1px solid #404040}.wy-menu-horiz li.divide-right{border-right:1px solid #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{color:#55a5d9;height:32px;line-height:32px;padding:0 1.618em;margin:12px 0 0;display:block;font-weight:700;text-transform:uppercase;font-size:85%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:1px solid #404040}.wy-menu-vertical li.divide-bottom{border-bottom:1px solid #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:grey;border-right:1px solid #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.rst-content .wy-menu-vertical li tt,.wy-menu-vertical li .rst-content tt,.wy-menu-vertical li code{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li button.toctree-expand{display:block;float:left;margin-left:-1.2em;line-height:18px;color:#4d4d4d;border:none;background:none;padding:0}.wy-menu-vertical li.current>a,.wy-menu-vertical li.on a{color:#404040;font-weight:700;position:relative;background:#fcfcfc;border:none;padding:.4045em 1.618em}.wy-menu-vertical li.current>a:hover,.wy-menu-vertical li.on a:hover{background:#fcfcfc}.wy-menu-vertical li.current>a:hover button.toctree-expand,.wy-menu-vertical li.on a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a 
button.toctree-expand{display:block;line-height:18px;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:1px solid #c9c9c9;border-top:1px solid #c9c9c9}.wy-menu-vertical .toctree-l1.current .toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .toctree-l11>ul{display:none}.wy-menu-vertical .toctree-l1.current .current.toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .current.toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .current.toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .current.toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .current.toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .current.toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .current.toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .current.toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .current.toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .current.toctree-l11>ul{display:block}.wy-menu-vertical li.toctree-l3,.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a,.wy-menu-vertical li.toctree-l5 a,.wy-menu-vertical li.toctree-l6 a,.wy-menu-vertical li.toctree-l7 a,.wy-menu-vertical li.toctree-l8 a,.wy-menu-vertical li.toctree-l9 a,.wy-menu-vertical li.toctree-l10 a{color:#404040}.wy-menu-vertical li.toctree-l2 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l3 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l4 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l5 a:hover 
button.toctree-expand,.wy-menu-vertical li.toctree-l6 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l7 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l8 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l9 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l10 a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a,.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a,.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a,.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a,.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a,.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a,.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a,.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{display:block}.wy-menu-vertical li.toctree-l2.current>a{padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{padding:.4045em 1.618em .4045em 4.045em}.wy-menu-vertical li.toctree-l3.current>a{padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{padding:.4045em 1.618em .4045em 5.663em}.wy-menu-vertical li.toctree-l4.current>a{padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{padding:.4045em 1.618em .4045em 7.281em}.wy-menu-vertical li.toctree-l5.current>a{padding:.4045em 7.281em}.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{padding:.4045em 1.618em .4045em 8.899em}.wy-menu-vertical li.toctree-l6.current>a{padding:.4045em 8.899em}.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a{padding:.4045em 1.618em .4045em 10.517em}.wy-menu-vertical li.toctree-l7.current>a{padding:.4045em 10.517em}.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a{padding:.4045em 1.618em .4045em 12.135em}.wy-menu-vertical li.toctree-l8.current>a{padding:.4045em 12.135em}.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a{padding:.4045em 1.618em .4045em 
13.753em}.wy-menu-vertical li.toctree-l9.current>a{padding:.4045em 13.753em}.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a{padding:.4045em 1.618em .4045em 15.371em}.wy-menu-vertical li.toctree-l10.current>a{padding:.4045em 15.371em}.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{padding:.4045em 1.618em .4045em 16.989em}.wy-menu-vertical li.toctree-l2.current>a,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{background:#c9c9c9}.wy-menu-vertical li.toctree-l2 button.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3.current>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{background:#bdbdbd}.wy-menu-vertical li.toctree-l3 button.toctree-expand{color:#969696}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:400}.wy-menu-vertical a{line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover button.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-menu-vertical a:active button.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980b9;text-align:center;color:#fcfcfc}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a{color:#fcfcfc;font-size:100%;font-weight:700;display:inline-block;padding:4px 6px;margin-bottom:.809em;max-width:100%}.wy-side-nav-search .wy-dropdown>a:hover,.wy-side-nav-search>a:hover{background:hsla(0,0%,100%,.1)}.wy-side-nav-search .wy-dropdown>a 
img.logo,.wy-side-nav-search>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search .wy-dropdown>a.icon img.logo,.wy-side-nav-search>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:400;color:hsla(0,0%,100%,.3)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:after,.wy-nav-top:before{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:700}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 
3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:grey}footer p{margin-bottom:12px}.rst-content footer span.commit tt,footer span.commit .rst-content tt,footer span.commit code{padding:0;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:1em;background:none;border:none;color:grey}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:after,.rst-footer-buttons:before{width:100%;display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:after,.rst-breadcrumbs-buttons:before{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:1px solid #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:1px solid #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:grey;font-size:90%}.genindextable li>ul{margin-left:24px}@media screen and (max-width:768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-menu.wy-menu-vertical,.wy-side-nav-search,.wy-side-scroll{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width:1100px){.wy-nav-content-wrap{background:rgba(0,0,0,.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,.wy-nav-side,footer{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions 
a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:after,.rst-versions .rst-current-version:before{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions 
.rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content 
.toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content 
pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso .last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section 
ol.loweralpha>li,
.rst-content .toctree-wrapper ol.loweralpha,
.rst-content .toctree-wrapper ol.loweralpha>li,
.rst-content section ol.loweralpha,
.rst-content section ol.loweralpha>li {
  /* Completes a selector list opened on the previous line. */
  list-style: lower-alpha;
}

/* Upper-case alphabetic counters for ol.upperalpha. */
.rst-content .section ol.upperalpha,
.rst-content .section ol.upperalpha>li,
.rst-content .toctree-wrapper ol.upperalpha,
.rst-content .toctree-wrapper ol.upperalpha>li,
.rst-content section ol.upperalpha,
.rst-content section ol.upperalpha>li {
  list-style: upper-alpha;
}

/* Every direct child of a list item gets 12px of vertical rhythm ... */
.rst-content .section ol li>*,
.rst-content .section ul li>*,
.rst-content .toctree-wrapper ol li>*,
.rst-content .toctree-wrapper ul li>*,
.rst-content section ol li>*,
.rst-content section ul li>* {
  margin-top: 12px;
  margin-bottom: 12px;
}

/* ... except that the first child sits flush with the top of the item. */
.rst-content .section ol li>:first-child,
.rst-content .section ul li>:first-child,
.rst-content .toctree-wrapper ol li>:first-child,
.rst-content .toctree-wrapper ul li>:first-child,
.rst-content section ol li>:first-child,
.rst-content section ul li>:first-child {
  margin-top: 0;
}

/* Paragraphs directly inside list items keep a 12px bottom margin,
   including when they are the last child. */
.rst-content .section ol li>p,
.rst-content .section ol li>p:last-child,
.rst-content .section ul li>p,
.rst-content .section ul li>p:last-child,
.rst-content .toctree-wrapper ol li>p,
.rst-content .toctree-wrapper ol li>p:last-child,
.rst-content .toctree-wrapper ul li>p,
.rst-content .toctree-wrapper ul li>p:last-child,
.rst-content section ol li>p,
.rst-content section ol li>p:last-child,
.rst-content section ul li>p,
.rst-content section ul li>p:last-child {
  margin-bottom: 12px;
}

/* The selector list below is intentionally left open: its last selector
   and its declaration block follow on the next line of the file. */
.rst-content .section ol li>p:only-child,
.rst-content .section ol li>p:only-child:last-child,
.rst-content .section ul li>p:only-child,
.rst-content .section ul li>p:only-child:last-child,
.rst-content .toctree-wrapper ol li>p:only-child,
.rst-content .toctree-wrapper ol li>p:only-child:last-child,
.rst-content .toctree-wrapper ul li>p:only-child,
.rst-content .toctree-wrapper ul li>p:only-child:last-child,
.rst-content section ol li>p:only-child,
.rst-content section ol li>p:only-child:last-child,
.rst-content section ul li>p:only-child,
.rst-content section ul
li>p:only-child:last-child {
  /* Completes a selector list opened on the previous line: a paragraph
     that is the only child of its list item adds no bottom margin. */
  margin-bottom: 0;
}

/* Lists nested directly inside a list item. */
.rst-content .section ol li>ol,
.rst-content .section ol li>ul,
.rst-content .section ul li>ol,
.rst-content .section ul li>ul,
.rst-content .toctree-wrapper ol li>ol,
.rst-content .toctree-wrapper ol li>ul,
.rst-content .toctree-wrapper ul li>ol,
.rst-content .toctree-wrapper ul li>ul,
.rst-content section ol li>ol,
.rst-content section ol li>ul,
.rst-content section ul li>ol,
.rst-content section ul li>ul {
  margin-bottom: 12px;
}

/* "simple" lists are compact: no vertical margins on item children or nested lists. */
.rst-content .section ol.simple li>*,
.rst-content .section ol.simple li ol,
.rst-content .section ol.simple li ul,
.rst-content .section ul.simple li>*,
.rst-content .section ul.simple li ol,
.rst-content .section ul.simple li ul,
.rst-content .toctree-wrapper ol.simple li>*,
.rst-content .toctree-wrapper ol.simple li ol,
.rst-content .toctree-wrapper ol.simple li ul,
.rst-content .toctree-wrapper ul.simple li>*,
.rst-content .toctree-wrapper ul.simple li ol,
.rst-content .toctree-wrapper ul.simple li ul,
.rst-content section ol.simple li>*,
.rst-content section ol.simple li ol,
.rst-content section ol.simple li ul,
.rst-content section ul.simple li>*,
.rst-content section ul.simple li ol,
.rst-content section ul.simple li ul {
  margin-top: 0;
  margin-bottom: 0;
}

/* Line blocks; a nested line block is indented and adds no bottom margin. */
.rst-content .line-block {
  margin-left: 0;
  margin-bottom: 24px;
  line-height: 24px;
}

.rst-content .line-block .line-block {
  margin-left: 24px;
  margin-bottom: 0;
}

.rst-content .topic-title {
  font-weight: 700;
  margin-bottom: 12px;
}

.rst-content .toc-backref {
  color: #404040;
}

/* Alignment helpers: float right/left with a 24px gutter, or centre. */
.rst-content .align-right {
  float: right;
  margin: 0 0 24px 24px;
}

.rst-content .align-left {
  float: left;
  margin: 0 24px 24px 0;
}

.rst-content .align-center {
  margin: auto;
}

/* Everything except tables becomes a block so that margin:auto can centre it. */
.rst-content .align-center:not(table) {
  display: block;
}

/* The selector list below is intentionally left open: it continues on
   the next line of the file. */
.rst-content .code-block-caption .headerlink,
.rst-content .eqno .headerlink,
.rst-content .toctree-wrapper>p.caption .headerlink,
.rst-content dl dt .headerlink,
.rst-content h1 .headerlink,
.rst-content h2 .headerlink,
.rst-content h3 .headerlink,
.rst-content h4 .headerlink,
.rst-content h5 .headerlink,
.rst-content h6
.headerlink,
.rst-content p.caption .headerlink,
.rst-content p .headerlink,
.rst-content table>caption .headerlink {
  /* Completes a selector list opened on the previous line: header
     permalinks are fully transparent by default. */
  opacity: 0;
  font-size: 14px;
  font-family: FontAwesome;
  margin-left: .5em;
}

/* Reveal the permalink when it has focus or when its container is hovered. */
.rst-content .code-block-caption .headerlink:focus,
.rst-content .code-block-caption:hover .headerlink,
.rst-content .eqno .headerlink:focus,
.rst-content .eqno:hover .headerlink,
.rst-content .toctree-wrapper>p.caption .headerlink:focus,
.rst-content .toctree-wrapper>p.caption:hover .headerlink,
.rst-content dl dt .headerlink:focus,
.rst-content dl dt:hover .headerlink,
.rst-content h1 .headerlink:focus,
.rst-content h1:hover .headerlink,
.rst-content h2 .headerlink:focus,
.rst-content h2:hover .headerlink,
.rst-content h3 .headerlink:focus,
.rst-content h3:hover .headerlink,
.rst-content h4 .headerlink:focus,
.rst-content h4:hover .headerlink,
.rst-content h5 .headerlink:focus,
.rst-content h5:hover .headerlink,
.rst-content h6 .headerlink:focus,
.rst-content h6:hover .headerlink,
.rst-content p.caption .headerlink:focus,
.rst-content p.caption:hover .headerlink,
.rst-content p .headerlink:focus,
.rst-content p:hover .headerlink,
.rst-content table>caption .headerlink:focus,
.rst-content table>caption:hover .headerlink {
  opacity: 1;
}

/* Links inside paragraphs may break at any point to avoid overflow. */
.rst-content p a {
  overflow-wrap: anywhere;
}

/* Paragraphs and bullet lists inside table cells inherit the cell's font size. */
.rst-content .wy-table td p,
.rst-content .wy-table td ul,
.rst-content .wy-table th p,
.rst-content .wy-table th ul,
.rst-content table.docutils td p,
.rst-content table.docutils td ul,
.rst-content table.docutils th p,
.rst-content table.docutils th ul,
.rst-content table.field-list td p,
.rst-content table.field-list td ul,
.rst-content table.field-list th p,
.rst-content table.field-list th ul {
  font-size: inherit;
}

.rst-content .btn:focus {
  outline: 2px solid;
}

.rst-content table>caption .headerlink:after {
  font-size: 12px;
}

.rst-content .centered {
  text-align: center;
}

/* Sidebar box: floated right at 40% width, with its own background and border. */
.rst-content .sidebar {
  float: right;
  width: 40%;
  display: block;
  margin: 0 0 24px 24px;
  padding: 24px;
  background: #f3f6f6;
  border: 1px solid #e1e4e5;
}

/* The selector list below is intentionally left open: its last selector
   and its declaration block follow on the next line of the file. */
.rst-content .sidebar dl,
.rst-content .sidebar p,
.rst-content .sidebar
ul {
  /* Completes a selector list opened on the previous line. */
  font-size: 90%;
}

.rst-content .sidebar .last,
.rst-content .sidebar>:last-child {
  margin-bottom: 0;
}

/* Sidebar title bar: negative margins cancel the sidebar's 24px padding
   on the top and sides. */
.rst-content .sidebar .sidebar-title {
  display: block;
  font-family: Roboto Slab, ff-tisa-web-pro, Georgia, Arial, sans-serif;
  font-weight: 700;
  background: #e1e4e5;
  padding: 6px 12px;
  margin: -24px -24px 24px;
  font-size: 100%;
}

/* .highlighted spans: background plus a same-colour 2px shadow halo. */
.rst-content .highlighted {
  background: #f1c40f;
  box-shadow: 0 0 0 2px #f1c40f;
  display: inline;
  font-weight: 700;
}

/* Footnote / citation markers: raised via relative positioning, with
   line-height 0 so the line box is not stretched. */
.rst-content .citation-reference,
.rst-content .footnote-reference {
  vertical-align: baseline;
  position: relative;
  top: -.4em;
  line-height: 0;
  font-size: 90%;
}

.rst-content .citation-reference>span.fn-bracket,
.rst-content .footnote-reference>span.fn-bracket {
  display: none;
}

.rst-content .hlist {
  width: 100%;
}

/* Definition-term classifiers: " : " is generated before the classifier,
   and the classifier-delimiter element is hidden. */
.rst-content dl dt span.classifier:before {
  content: " : ";
}

.rst-content dl dt span.classifier-delimiter {
  display: none !important;
}

/* html.writer-html4: footnotes and citations are tables; strip table chrome. */
html.writer-html4 .rst-content table.docutils.citation,
html.writer-html4 .rst-content table.docutils.footnote {
  background: none;
  border: none;
}

html.writer-html4 .rst-content table.docutils.citation td,
html.writer-html4 .rst-content table.docutils.citation tr,
html.writer-html4 .rst-content table.docutils.footnote td,
html.writer-html4 .rst-content table.docutils.footnote tr {
  border: none;
  background-color: transparent !important;
  white-space: normal;
}

html.writer-html4 .rst-content table.docutils.citation td.label,
html.writer-html4 .rst-content table.docutils.footnote td.label {
  padding-left: 0;
  padding-right: 0;
  vertical-align: top;
}

/* html.writer-html5: dl-based citations, field lists and footnotes use a
   two-column grid. */
html.writer-html5 .rst-content dl.citation,
html.writer-html5 .rst-content dl.field-list,
html.writer-html5 .rst-content dl.footnote {
  display: grid;
  grid-template-columns: auto minmax(80%,95%);
}

html.writer-html5 .rst-content dl.citation>dt,
html.writer-html5 .rst-content dl.field-list>dt,
html.writer-html5 .rst-content dl.footnote>dt {
  display: inline-grid;
  grid-template-columns: max-content auto;
}

/* aside/div-based footnotes and citations use a four-column grid: label
   in column 1, back-references in column 2, paragraphs in column 4. */
html.writer-html5 .rst-content aside.citation,
html.writer-html5 .rst-content aside.footnote,
html.writer-html5 .rst-content div.citation {
  display: grid;
  grid-template-columns: auto auto minmax(.65rem,auto) minmax(40%,95%);
}

html.writer-html5 .rst-content aside.citation>span.label,
html.writer-html5 .rst-content aside.footnote>span.label,
html.writer-html5 .rst-content div.citation>span.label {
  grid-column-start: 1;
  grid-column-end: 2;
}

html.writer-html5 .rst-content aside.citation>span.backrefs,
html.writer-html5 .rst-content aside.footnote>span.backrefs,
html.writer-html5 .rst-content div.citation>span.backrefs {
  grid-column-start: 2;
  grid-column-end: 3;
  grid-row-start: 1;
  grid-row-end: 3;
}

html.writer-html5 .rst-content aside.citation>p,
html.writer-html5 .rst-content aside.footnote>p,
html.writer-html5 .rst-content div.citation>p {
  grid-column-start: 4;
  grid-column-end: 5;
}

/* Spacing and typography for the dl-based variants. */
html.writer-html5 .rst-content dl.citation,
html.writer-html5 .rst-content dl.field-list,
html.writer-html5 .rst-content dl.footnote {
  margin-bottom: 24px;
}

html.writer-html5 .rst-content dl.citation>dt,
html.writer-html5 .rst-content dl.field-list>dt,
html.writer-html5 .rst-content dl.footnote>dt {
  padding-left: 1rem;
}

html.writer-html5 .rst-content dl.citation>dd,
html.writer-html5 .rst-content dl.citation>dt,
html.writer-html5 .rst-content dl.field-list>dd,
html.writer-html5 .rst-content dl.field-list>dt,
html.writer-html5 .rst-content dl.footnote>dd,
html.writer-html5 .rst-content dl.footnote>dt {
  margin-bottom: 0;
}

html.writer-html5 .rst-content dl.citation,
html.writer-html5 .rst-content dl.footnote {
  font-size: .9rem;
}

html.writer-html5 .rst-content dl.citation>dt,
html.writer-html5 .rst-content dl.footnote>dt {
  margin: 0 .5rem .5rem 0;
  line-height: 1.2rem;
  word-break: break-all;
  font-weight: 400;
}

/* Square brackets are generated around the label text. */
html.writer-html5 .rst-content dl.citation>dt>span.brackets:before,
html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before {
  content: "[";
}

html.writer-html5 .rst-content dl.citation>dt>span.brackets:after,
html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after {
  content: "]";
}

/* The selector list below is intentionally left open: its last selector
   and its declaration block follow on the next line of the file. */
html.writer-html5 .rst-content dl.citation>dt>span.fn-backref,
html.writer-html5 .rst-content
dl.footnote>dt>span.fn-backref{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a{word-break:keep-all}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a:not(:first-child):before,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.footnote>dd{margin:0 0 .5rem;line-height:1.2rem}html.writer-html5 .rst-content dl.citation>dd p,html.writer-html5 .rst-content dl.footnote>dd p{font-size:.9rem}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{padding-left:1rem;padding-right:1rem;font-size:.9rem;line-height:1.2rem}html.writer-html5 .rst-content aside.citation p,html.writer-html5 .rst-content aside.footnote p,html.writer-html5 .rst-content div.citation p{font-size:.9rem;line-height:1.2rem;margin-bottom:12px}html.writer-html5 .rst-content aside.citation span.backrefs,html.writer-html5 .rst-content aside.footnote span.backrefs,html.writer-html5 .rst-content div.citation span.backrefs{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content aside.citation span.backrefs>a,html.writer-html5 .rst-content aside.footnote span.backrefs>a,html.writer-html5 .rst-content div.citation span.backrefs>a{word-break:keep-all}html.writer-html5 .rst-content aside.citation span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content aside.footnote span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content div.citation span.backrefs>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content aside.citation span.label,html.writer-html5 .rst-content aside.footnote 
span.label,html.writer-html5 .rst-content div.citation span.label{line-height:1.2rem}html.writer-html5 .rst-content aside.citation-list,html.writer-html5 .rst-content aside.footnote-list,html.writer-html5 .rst-content div.citation-list{margin-bottom:24px}html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content aside.footnote-list aside.footnote,html.writer-html5 .rst-content div.citation-list>div.citation,html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content aside.footnote-list aside.footnote code,html.writer-html5 .rst-content aside.footnote-list aside.footnote tt,html.writer-html5 .rst-content aside.footnote code,html.writer-html5 .rst-content aside.footnote tt,html.writer-html5 .rst-content div.citation-list>div.citation code,html.writer-html5 .rst-content div.citation-list>div.citation tt,html.writer-html5 .rst-content dl.citation code,html.writer-html5 .rst-content dl.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content .wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils 
td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040;overflow-wrap:normal}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}.rst-content dl dd>ol:last-child,.rst-content dl dd>p:last-child,.rst-content dl dd>table:last-child,.rst-content dl dd>ul:last-child{margin-bottom:0}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content 
dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) 
code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 
.rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel{border:1px solid #7fbbe3;background:#e7f2fa;font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>.kbd,.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>kbd{color:inherit;font-size:80%;background-color:#fff;border:1px solid #a6a6a6;border-radius:4px;box-shadow:0 2px grey;padding:2.4px 6px;margin:auto 0}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content 
.sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block} diff --git a/css/theme_extra.css b/css/theme_extra.css new file mode 100644 index 0000000..9f4b063 --- /dev/null +++ b/css/theme_extra.css @@ -0,0 +1,191 @@ +/* + * Wrap inline code samples otherwise they shoot of the side and + * can't be read at all. 
+ * + * https://github.com/mkdocs/mkdocs/issues/313 + * https://github.com/mkdocs/mkdocs/issues/233 + * https://github.com/mkdocs/mkdocs/issues/834 + */ +.rst-content code { + white-space: pre-wrap; + word-wrap: break-word; + padding: 2px 5px; +} + +/** + * Make code blocks display as blocks and give them the appropriate + * font size and padding. + * + * https://github.com/mkdocs/mkdocs/issues/855 + * https://github.com/mkdocs/mkdocs/issues/834 + * https://github.com/mkdocs/mkdocs/issues/233 + */ +.rst-content pre code { + white-space: pre; + word-wrap: normal; + display: block; + padding: 12px; + font-size: 12px; +} + +/** + * Fix code colors + * + * https://github.com/mkdocs/mkdocs/issues/2027 + */ +.rst-content code { + color: #E74C3C; +} + +.rst-content pre code { + color: #000; + background: #f8f8f8; +} + +/* + * Fix link colors when the link text is inline code. + * + * https://github.com/mkdocs/mkdocs/issues/718 + */ +a code { + color: #2980B9; +} +a:hover code { + color: #3091d1; +} +a:visited code { + color: #9B59B6; +} + +/* + * The CSS classes from highlight.js seem to clash with the + * ReadTheDocs theme causing some code to be incorrectly made + * bold and italic. + * + * https://github.com/mkdocs/mkdocs/issues/411 + */ +pre .cs, pre .c { + font-weight: inherit; + font-style: inherit; +} + +/* + * Fix some issues with the theme and non-highlighted code + * samples. Without and highlighting styles attached the + * formatting is broken. 
+ * + * https://github.com/mkdocs/mkdocs/issues/319 + */ +.rst-content .no-highlight { + display: block; + padding: 0.5em; + color: #333; +} + + +/* + * Additions specific to the search functionality provided by MkDocs + */ + +.search-results { + margin-top: 23px; +} + +.search-results article { + border-top: 1px solid #E1E4E5; + padding-top: 24px; +} + +.search-results article:first-child { + border-top: none; +} + +form .search-query { + width: 100%; + border-radius: 50px; + padding: 6px 12px; /* csslint allow: box-model */ + border-color: #D1D4D5; +} + +/* + * Improve inline code blocks within admonitions. + * + * https://github.com/mkdocs/mkdocs/issues/656 + */ + .rst-content .admonition code { + color: #404040; + border: 1px solid #c7c9cb; + border: 1px solid rgba(0, 0, 0, 0.2); + background: #f8fbfd; + background: rgba(255, 255, 255, 0.7); +} + +/* + * Account for wide tables which go off the side. + * Override borders to avoid weirdness on narrow tables. + * + * https://github.com/mkdocs/mkdocs/issues/834 + * https://github.com/mkdocs/mkdocs/pull/1034 + */ +.rst-content .section .docutils { + width: 100%; + overflow: auto; + display: block; + border: none; +} + +td, th { + border: 1px solid #e1e4e5 !important; /* csslint allow: important */ + border-collapse: collapse; +} + +/* + * Without the following amendments, the navigation in the theme will be + * slightly cut off. This is due to the fact that the .wy-nav-side has a + * padding-bottom of 2em, which must not necessarily align with the font-size of + * 90 % on the .rst-current-version container, combined with the padding of 12px + * above and below. These amendments fix this in two steps: First, make sure the + * .rst-current-version container has a fixed height of 40px, achieved using + * line-height, and then applying a padding-bottom of 40px to this container. In + * a second step, the items within that container are re-aligned using flexbox. 
+ * + * https://github.com/mkdocs/mkdocs/issues/2012 + */ + .wy-nav-side { + padding-bottom: 40px; +} + +/* + * The second step of above amendment: Here we make sure the items are aligned + * correctly within the .rst-current-version container. Using flexbox, we + * achieve it in such a way that it will look like the following: + * + * [No repo_name] + * Next >> // On the first page + * << Previous Next >> // On all subsequent pages + * + * [With repo_name] + * Next >> // On the first page + * << Previous Next >> // On all subsequent pages + * + * https://github.com/mkdocs/mkdocs/issues/2012 + */ +.rst-versions .rst-current-version { + padding: 0 12px; + display: flex; + font-size: initial; + justify-content: space-between; + align-items: center; + line-height: 40px; +} + +/* + * Please note that this amendment also involves removing certain inline-styles + * from the file ./mkdocs/themes/readthedocs/versions.html. + * + * https://github.com/mkdocs/mkdocs/issues/2012 + */ +.rst-current-version span { + flex: 1; + text-align: center; +} diff --git a/img/favicon.ico b/img/favicon.ico new file mode 100644 index 0000000..e85006a Binary files /dev/null and b/img/favicon.ico differ diff --git a/index.html b/index.html new file mode 100644 index 0000000..50c461a --- /dev/null +++ b/index.html @@ -0,0 +1,198 @@ + + + + + + + + Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

Unofficial LoongArch Intrinsics Guide

+

This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et, al. The documentation is arranged from the following sources:

+
    +
  • QEMU
  • +
  • GCC
  • +
  • Observations from real hardware incl. 3C5000 and 3A6000
  • +
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + + Next » + + +
+ + + + + + + + + + diff --git a/js/html5shiv.min.js b/js/html5shiv.min.js new file mode 100644 index 0000000..1a01c94 --- /dev/null +++ b/js/html5shiv.min.js @@ -0,0 +1,4 @@ +/** +* @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed +*/ +!function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var 
k,l,m="3.7.3",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document); diff --git a/js/jquery-3.6.0.min.js b/js/jquery-3.6.0.min.js new file mode 100644 index 0000000..c4c6022 --- /dev/null +++ b/js/jquery-3.6.0.min.js @@ -0,0 +1,2 @@ +/*! 
jQuery v3.6.0 | (c) OpenJS Foundation and other contributors | jquery.org/license */ +!function(e,t){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=e.document?t(e,!0):function(e){if(!e.document)throw new Error("jQuery requires a window with a document");return t(e)}:t(e)}("undefined"!=typeof window?window:this,function(C,e){"use strict";var t=[],r=Object.getPrototypeOf,s=t.slice,g=t.flat?function(e){return t.flat.call(e)}:function(e){return t.concat.apply([],e)},u=t.push,i=t.indexOf,n={},o=n.toString,v=n.hasOwnProperty,a=v.toString,l=a.call(Object),y={},m=function(e){return"function"==typeof e&&"number"!=typeof e.nodeType&&"function"!=typeof e.item},x=function(e){return null!=e&&e===e.window},E=C.document,c={type:!0,src:!0,nonce:!0,noModule:!0};function b(e,t,n){var r,i,o=(n=n||E).createElement("script");if(o.text=e,t)for(r in c)(i=t[r]||t.getAttribute&&t.getAttribute(r))&&o.setAttribute(r,i);n.head.appendChild(o).parentNode.removeChild(o)}function w(e){return null==e?e+"":"object"==typeof e||"function"==typeof e?n[o.call(e)]||"object":typeof e}var f="3.6.0",S=function(e,t){return new S.fn.init(e,t)};function p(e){var t=!!e&&"length"in e&&e.length,n=w(e);return!m(e)&&!x(e)&&("array"===n||0===t||"number"==typeof t&&0+~]|"+M+")"+M+"*"),U=new RegExp(M+"|>"),X=new RegExp(F),V=new RegExp("^"+I+"$"),G={ID:new RegExp("^#("+I+")"),CLASS:new RegExp("^\\.("+I+")"),TAG:new RegExp("^("+I+"|[*])"),ATTR:new RegExp("^"+W),PSEUDO:new RegExp("^"+F),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+M+"*(even|odd|(([+-]|)(\\d*)n|)"+M+"*(?:([+-]|)"+M+"*(\\d+)|))"+M+"*\\)|)","i"),bool:new RegExp("^(?:"+R+")$","i"),needsContext:new RegExp("^"+M+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+M+"*((?:-\\d)?\\d*)"+M+"*\\)|)(?=[^-]|$)","i")},Y=/HTML$/i,Q=/^(?:input|select|textarea|button)$/i,J=/^h\d$/i,K=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,ee=/[+~]/,te=new 
RegExp("\\\\[\\da-fA-F]{1,6}"+M+"?|\\\\([^\\r\\n\\f])","g"),ne=function(e,t){var n="0x"+e.slice(1)-65536;return t||(n<0?String.fromCharCode(n+65536):String.fromCharCode(n>>10|55296,1023&n|56320))},re=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ie=function(e,t){return t?"\0"===e?"\ufffd":e.slice(0,-1)+"\\"+e.charCodeAt(e.length-1).toString(16)+" ":"\\"+e},oe=function(){T()},ae=be(function(e){return!0===e.disabled&&"fieldset"===e.nodeName.toLowerCase()},{dir:"parentNode",next:"legend"});try{H.apply(t=O.call(p.childNodes),p.childNodes),t[p.childNodes.length].nodeType}catch(e){H={apply:t.length?function(e,t){L.apply(e,O.call(t))}:function(e,t){var n=e.length,r=0;while(e[n++]=t[r++]);e.length=n-1}}}function se(t,e,n,r){var i,o,a,s,u,l,c,f=e&&e.ownerDocument,p=e?e.nodeType:9;if(n=n||[],"string"!=typeof t||!t||1!==p&&9!==p&&11!==p)return n;if(!r&&(T(e),e=e||C,E)){if(11!==p&&(u=Z.exec(t)))if(i=u[1]){if(9===p){if(!(a=e.getElementById(i)))return n;if(a.id===i)return n.push(a),n}else if(f&&(a=f.getElementById(i))&&y(e,a)&&a.id===i)return n.push(a),n}else{if(u[2])return H.apply(n,e.getElementsByTagName(t)),n;if((i=u[3])&&d.getElementsByClassName&&e.getElementsByClassName)return H.apply(n,e.getElementsByClassName(i)),n}if(d.qsa&&!N[t+" "]&&(!v||!v.test(t))&&(1!==p||"object"!==e.nodeName.toLowerCase())){if(c=t,f=e,1===p&&(U.test(t)||z.test(t))){(f=ee.test(t)&&ye(e.parentNode)||e)===e&&d.scope||((s=e.getAttribute("id"))?s=s.replace(re,ie):e.setAttribute("id",s=S)),o=(l=h(t)).length;while(o--)l[o]=(s?"#"+s:":scope")+" "+xe(l[o]);c=l.join(",")}try{return H.apply(n,f.querySelectorAll(c)),n}catch(e){N(t,!0)}finally{s===S&&e.removeAttribute("id")}}}return g(t.replace($,"$1"),e,n,r)}function ue(){var r=[];return function e(t,n){return r.push(t+" ")>b.cacheLength&&delete e[r.shift()],e[t+" "]=n}}function le(e){return e[S]=!0,e}function ce(e){var t=C.createElement("fieldset");try{return!!e(t)}catch(e){return!1}finally{t.parentNode&&t.parentNode.removeChild(t),t=null}}function 
fe(e,t){var n=e.split("|"),r=n.length;while(r--)b.attrHandle[n[r]]=t}function pe(e,t){var n=t&&e,r=n&&1===e.nodeType&&1===t.nodeType&&e.sourceIndex-t.sourceIndex;if(r)return r;if(n)while(n=n.nextSibling)if(n===t)return-1;return e?1:-1}function de(t){return function(e){return"input"===e.nodeName.toLowerCase()&&e.type===t}}function he(n){return function(e){var t=e.nodeName.toLowerCase();return("input"===t||"button"===t)&&e.type===n}}function ge(t){return function(e){return"form"in e?e.parentNode&&!1===e.disabled?"label"in e?"label"in e.parentNode?e.parentNode.disabled===t:e.disabled===t:e.isDisabled===t||e.isDisabled!==!t&&ae(e)===t:e.disabled===t:"label"in e&&e.disabled===t}}function ve(a){return le(function(o){return o=+o,le(function(e,t){var n,r=a([],e.length,o),i=r.length;while(i--)e[n=r[i]]&&(e[n]=!(t[n]=e[n]))})})}function ye(e){return e&&"undefined"!=typeof e.getElementsByTagName&&e}for(e in d=se.support={},i=se.isXML=function(e){var t=e&&e.namespaceURI,n=e&&(e.ownerDocument||e).documentElement;return!Y.test(t||n&&n.nodeName||"HTML")},T=se.setDocument=function(e){var t,n,r=e?e.ownerDocument||e:p;return r!=C&&9===r.nodeType&&r.documentElement&&(a=(C=r).documentElement,E=!i(C),p!=C&&(n=C.defaultView)&&n.top!==n&&(n.addEventListener?n.addEventListener("unload",oe,!1):n.attachEvent&&n.attachEvent("onunload",oe)),d.scope=ce(function(e){return a.appendChild(e).appendChild(C.createElement("div")),"undefined"!=typeof e.querySelectorAll&&!e.querySelectorAll(":scope fieldset div").length}),d.attributes=ce(function(e){return e.className="i",!e.getAttribute("className")}),d.getElementsByTagName=ce(function(e){return e.appendChild(C.createComment("")),!e.getElementsByTagName("*").length}),d.getElementsByClassName=K.test(C.getElementsByClassName),d.getById=ce(function(e){return a.appendChild(e).id=S,!C.getElementsByName||!C.getElementsByName(S).length}),d.getById?(b.filter.ID=function(e){var t=e.replace(te,ne);return function(e){return 
e.getAttribute("id")===t}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n=t.getElementById(e);return n?[n]:[]}}):(b.filter.ID=function(e){var n=e.replace(te,ne);return function(e){var t="undefined"!=typeof e.getAttributeNode&&e.getAttributeNode("id");return t&&t.value===n}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n,r,i,o=t.getElementById(e);if(o){if((n=o.getAttributeNode("id"))&&n.value===e)return[o];i=t.getElementsByName(e),r=0;while(o=i[r++])if((n=o.getAttributeNode("id"))&&n.value===e)return[o]}return[]}}),b.find.TAG=d.getElementsByTagName?function(e,t){return"undefined"!=typeof t.getElementsByTagName?t.getElementsByTagName(e):d.qsa?t.querySelectorAll(e):void 0}:function(e,t){var n,r=[],i=0,o=t.getElementsByTagName(e);if("*"===e){while(n=o[i++])1===n.nodeType&&r.push(n);return r}return o},b.find.CLASS=d.getElementsByClassName&&function(e,t){if("undefined"!=typeof t.getElementsByClassName&&E)return t.getElementsByClassName(e)},s=[],v=[],(d.qsa=K.test(C.querySelectorAll))&&(ce(function(e){var t;a.appendChild(e).innerHTML="",e.querySelectorAll("[msallowcapture^='']").length&&v.push("[*^$]="+M+"*(?:''|\"\")"),e.querySelectorAll("[selected]").length||v.push("\\["+M+"*(?:value|"+R+")"),e.querySelectorAll("[id~="+S+"-]").length||v.push("~="),(t=C.createElement("input")).setAttribute("name",""),e.appendChild(t),e.querySelectorAll("[name='']").length||v.push("\\["+M+"*name"+M+"*="+M+"*(?:''|\"\")"),e.querySelectorAll(":checked").length||v.push(":checked"),e.querySelectorAll("a#"+S+"+*").length||v.push(".#.+[+~]"),e.querySelectorAll("\\\f"),v.push("[\\r\\n\\f]")}),ce(function(e){e.innerHTML="";var 
t=C.createElement("input");t.setAttribute("type","hidden"),e.appendChild(t).setAttribute("name","D"),e.querySelectorAll("[name=d]").length&&v.push("name"+M+"*[*^$|!~]?="),2!==e.querySelectorAll(":enabled").length&&v.push(":enabled",":disabled"),a.appendChild(e).disabled=!0,2!==e.querySelectorAll(":disabled").length&&v.push(":enabled",":disabled"),e.querySelectorAll("*,:x"),v.push(",.*:")})),(d.matchesSelector=K.test(c=a.matches||a.webkitMatchesSelector||a.mozMatchesSelector||a.oMatchesSelector||a.msMatchesSelector))&&ce(function(e){d.disconnectedMatch=c.call(e,"*"),c.call(e,"[s!='']:x"),s.push("!=",F)}),v=v.length&&new RegExp(v.join("|")),s=s.length&&new RegExp(s.join("|")),t=K.test(a.compareDocumentPosition),y=t||K.test(a.contains)?function(e,t){var n=9===e.nodeType?e.documentElement:e,r=t&&t.parentNode;return e===r||!(!r||1!==r.nodeType||!(n.contains?n.contains(r):e.compareDocumentPosition&&16&e.compareDocumentPosition(r)))}:function(e,t){if(t)while(t=t.parentNode)if(t===e)return!0;return!1},j=t?function(e,t){if(e===t)return l=!0,0;var n=!e.compareDocumentPosition-!t.compareDocumentPosition;return n||(1&(n=(e.ownerDocument||e)==(t.ownerDocument||t)?e.compareDocumentPosition(t):1)||!d.sortDetached&&t.compareDocumentPosition(e)===n?e==C||e.ownerDocument==p&&y(p,e)?-1:t==C||t.ownerDocument==p&&y(p,t)?1:u?P(u,e)-P(u,t):0:4&n?-1:1)}:function(e,t){if(e===t)return l=!0,0;var n,r=0,i=e.parentNode,o=t.parentNode,a=[e],s=[t];if(!i||!o)return e==C?-1:t==C?1:i?-1:o?1:u?P(u,e)-P(u,t):0;if(i===o)return pe(e,t);n=e;while(n=n.parentNode)a.unshift(n);n=t;while(n=n.parentNode)s.unshift(n);while(a[r]===s[r])r++;return r?pe(a[r],s[r]):a[r]==p?-1:s[r]==p?1:0}),C},se.matches=function(e,t){return se(e,null,null,t)},se.matchesSelector=function(e,t){if(T(e),d.matchesSelector&&E&&!N[t+" "]&&(!s||!s.test(t))&&(!v||!v.test(t)))try{var n=c.call(e,t);if(n||d.disconnectedMatch||e.document&&11!==e.document.nodeType)return n}catch(e){N(t,!0)}return 0":{dir:"parentNode",first:!0}," 
":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(e){return e[1]=e[1].replace(te,ne),e[3]=(e[3]||e[4]||e[5]||"").replace(te,ne),"~="===e[2]&&(e[3]=" "+e[3]+" "),e.slice(0,4)},CHILD:function(e){return e[1]=e[1].toLowerCase(),"nth"===e[1].slice(0,3)?(e[3]||se.error(e[0]),e[4]=+(e[4]?e[5]+(e[6]||1):2*("even"===e[3]||"odd"===e[3])),e[5]=+(e[7]+e[8]||"odd"===e[3])):e[3]&&se.error(e[0]),e},PSEUDO:function(e){var t,n=!e[6]&&e[2];return G.CHILD.test(e[0])?null:(e[3]?e[2]=e[4]||e[5]||"":n&&X.test(n)&&(t=h(n,!0))&&(t=n.indexOf(")",n.length-t)-n.length)&&(e[0]=e[0].slice(0,t),e[2]=n.slice(0,t)),e.slice(0,3))}},filter:{TAG:function(e){var t=e.replace(te,ne).toLowerCase();return"*"===e?function(){return!0}:function(e){return e.nodeName&&e.nodeName.toLowerCase()===t}},CLASS:function(e){var t=m[e+" "];return t||(t=new RegExp("(^|"+M+")"+e+"("+M+"|$)"))&&m(e,function(e){return t.test("string"==typeof e.className&&e.className||"undefined"!=typeof e.getAttribute&&e.getAttribute("class")||"")})},ATTR:function(n,r,i){return function(e){var t=se.attr(e,n);return null==t?"!="===r:!r||(t+="","="===r?t===i:"!="===r?t!==i:"^="===r?i&&0===t.indexOf(i):"*="===r?i&&-1:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function j(e,n,r){return m(n)?S.grep(e,function(e,t){return!!n.call(e,t,e)!==r}):n.nodeType?S.grep(e,function(e){return e===n!==r}):"string"!=typeof n?S.grep(e,function(e){return-1)[^>]*|#([\w-]+))$/;(S.fn.init=function(e,t,n){var r,i;if(!e)return this;if(n=n||D,"string"==typeof e){if(!(r="<"===e[0]&&">"===e[e.length-1]&&3<=e.length?[null,e,null]:q.exec(e))||!r[1]&&t)return!t||t.jquery?(t||n).find(e):this.constructor(t).find(e);if(r[1]){if(t=t instanceof S?t[0]:t,S.merge(this,S.parseHTML(r[1],t&&t.nodeType?t.ownerDocument||t:E,!0)),N.test(r[1])&&S.isPlainObject(t))for(r in t)m(this[r])?this[r](t[r]):this.attr(r,t[r]);return this}return(i=E.getElementById(r[2]))&&(this[0]=i,this.length=1),this}return 
e.nodeType?(this[0]=e,this.length=1,this):m(e)?void 0!==n.ready?n.ready(e):e(S):S.makeArray(e,this)}).prototype=S.fn,D=S(E);var L=/^(?:parents|prev(?:Until|All))/,H={children:!0,contents:!0,next:!0,prev:!0};function O(e,t){while((e=e[t])&&1!==e.nodeType);return e}S.fn.extend({has:function(e){var t=S(e,this),n=t.length;return this.filter(function(){for(var e=0;e\x20\t\r\n\f]*)/i,he=/^$|^module$|\/(?:java|ecma)script/i;ce=E.createDocumentFragment().appendChild(E.createElement("div")),(fe=E.createElement("input")).setAttribute("type","radio"),fe.setAttribute("checked","checked"),fe.setAttribute("name","t"),ce.appendChild(fe),y.checkClone=ce.cloneNode(!0).cloneNode(!0).lastChild.checked,ce.innerHTML="",y.noCloneChecked=!!ce.cloneNode(!0).lastChild.defaultValue,ce.innerHTML="",y.option=!!ce.lastChild;var ge={thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};function ve(e,t){var n;return n="undefined"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||"*"):"undefined"!=typeof e.querySelectorAll?e.querySelectorAll(t||"*"):[],void 0===t||t&&A(e,t)?S.merge([e],n):n}function ye(e,t){for(var n=0,r=e.length;n",""]);var me=/<|&#?\w+;/;function xe(e,t,n,r,i){for(var o,a,s,u,l,c,f=t.createDocumentFragment(),p=[],d=0,h=e.length;d\s*$/g;function je(e,t){return A(e,"table")&&A(11!==t.nodeType?t:t.firstChild,"tr")&&S(e).children("tbody")[0]||e}function De(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function qe(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function Le(e,t){var n,r,i,o,a,s;if(1===t.nodeType){if(Y.hasData(e)&&(s=Y.get(e).events))for(i in Y.remove(t,"handle events"),s)for(n=0,r=s[i].length;n").attr(n.scriptAttrs||{}).prop({charset:n.scriptCharset,src:n.url}).on("load error",i=function(e){r.remove(),i=null,e&&t("error"===e.type?404:200,e.type)}),E.head.appendChild(r[0])},abort:function(){i&&i()}}});var _t,zt=[],Ut=/(=)\?(?=&|$)|\?\?/;S.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var e=zt.pop()||S.expando+"_"+wt.guid++;return this[e]=!0,e}}),S.ajaxPrefilter("json jsonp",function(e,t,n){var r,i,o,a=!1!==e.jsonp&&(Ut.test(e.url)?"url":"string"==typeof e.data&&0===(e.contentType||"").indexOf("application/x-www-form-urlencoded")&&Ut.test(e.data)&&"data");if(a||"jsonp"===e.dataTypes[0])return r=e.jsonpCallback=m(e.jsonpCallback)?e.jsonpCallback():e.jsonpCallback,a?e[a]=e[a].replace(Ut,"$1"+r):!1!==e.jsonp&&(e.url+=(Tt.test(e.url)?"&":"?")+e.jsonp+"="+r),e.converters["script json"]=function(){return o||S.error(r+" was not called"),o[0]},e.dataTypes[0]="json",i=C[r],C[r]=function(){o=arguments},n.always(function(){void 0===i?S(C).removeProp(r):C[r]=i,e[r]&&(e.jsonpCallback=t.jsonpCallback,zt.push(r)),o&&m(i)&&i(o[0]),o=i=void 0}),"script"}),y.createHTMLDocument=((_t=E.implementation.createHTMLDocument("").body).innerHTML="
",2===_t.childNodes.length),S.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(y.createHTMLDocument?((r=(t=E.implementation.createHTMLDocument("")).createElement("base")).href=E.location.href,t.head.appendChild(r)):t=E),o=!n&&[],(i=N.exec(e))?[t.createElement(i[1])]:(i=xe([e],t,o),o&&o.length&&S(o).remove(),S.merge([],i.childNodes)));var r,i,o},S.fn.load=function(e,t,n){var r,i,o,a=this,s=e.indexOf(" ");return-1").append(S.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},S.expr.pseudos.animated=function(t){return S.grep(S.timers,function(e){return t===e.elem}).length},S.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=S.css(e,"position"),c=S(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=S.css(e,"top"),u=S.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,S.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):c.css(f)}},S.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){S.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var 
e,t,n,r=this[0],i={top:0,left:0};if("fixed"===S.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===S.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=S(e).offset()).top+=S.css(e,"borderTopWidth",!0),i.left+=S.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-S.css(r,"marginTop",!0),left:t.left-i.left-S.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===S.css(e,"position"))e=e.offsetParent;return e||re})}}),S.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;S.fn[t]=function(e){return $(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),S.each(["top","left"],function(e,n){S.cssHooks[n]=Fe(y.pixelPosition,function(e,t){if(t)return t=We(e,n),Pe.test(t)?S(e).position()[n]+"px":t})}),S.each({Height:"height",Width:"width"},function(a,s){S.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){S.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return $(this,function(e,t,n){var r;return x(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?S.css(e,t,i):S.style(e,t,n,i)},s,n?e:void 0,n)}})}),S.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){S.fn[t]=function(e){return this.on(t,e)}}),S.fn.extend({bind:function(e,t,n){return this.on(e,null,t,n)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,n,r){return this.on(t,e,n,r)},undelegate:function(e,t,n){return 
1===arguments.length?this.off(e,"**"):this.off(t,e||"**",n)},hover:function(e,t){return this.mouseenter(e).mouseleave(t||e)}}),S.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){S.fn[n]=function(e,t){return 0"),n("table.docutils.footnote").wrap("
"),n("table.docutils.citation").wrap("
"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){var t=n(this);expand=n(''),expand.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(expand)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){this.linkScroll=!0,this.win.one("hashchange",(function(){this.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t + + + + + + + Bitwise Operations - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Bitwise Operations

+

__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvbitsel.v xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise selection: for each bit position, if the bit in c is one, copy the bit from b to dst; otherwise copy the bit from a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseli.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise selection: for each bit position, if the bit in a is one, copy the bit from imm to dst; otherwise copy the bit from b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 8-bit element of a, clear the bit whose index is given by the corresponding element of b (modulo 8), and save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 16-bit element of a, clear the bit whose index is given by the corresponding element of b (modulo 16), and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 32-bit element of a, clear the bit whose index is given by the corresponding element of b (modulo 32), and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitclr.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 64-bit element of a, clear the bit whose index is given by the corresponding element of b (modulo 64), and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clear bit imm in each 8-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clear bit imm in each 16-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clear bit imm in each 32-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitclri.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clear bit imm in each 64-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitset_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitset_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 8-bit element of a, set the bit whose index is given by the corresponding element of b (modulo 8), and save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitset_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitset_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 16-bit element of a, set the bit whose index is given by the corresponding element of b (modulo 16), and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitset_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitset_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 32-bit element of a, set the bit whose index is given by the corresponding element of b (modulo 32), and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitset_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitset_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitset.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 64-bit element of a, set the bit whose index is given by the corresponding element of b (modulo 64), and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Set bit imm in each 8-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Set bit imm in each 16-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Set bit imm in each 32-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitseti.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Set bit imm in each 64-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 8-bit element of a, toggle the bit whose index is given by the corresponding element of b (modulo 8), and save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 16-bit element of a, toggle the bit whose index is given by the corresponding element of b (modulo 16), and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 32-bit element of a, toggle the bit whose index is given by the corresponding element of b (modulo 32), and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvbitrev.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

In each 64-bit element of a, toggle the bit whose index is given by the corresponding element of b (modulo 64), and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Toggle bit imm in each 8-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Toggle bit imm in each 16-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Toggle bit imm in each 32-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvbitrevi.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Toggle bit imm in each 64-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvclo_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclo_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading ones of 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = clo(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvclo_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclo_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading ones of 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = clo(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvclo_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclo_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading ones of 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = clo(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvclo_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclo_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclo.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading ones of 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = clo(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvclz_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading zeros of 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = clz(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvclz_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading zeros of 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = clz(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvclz_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading zeros of 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = clz(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvclz_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvclz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvclz.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Count leading zeros of 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = clz(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvexth_h_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_h_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.h.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the signed 8-bit elements in the higher half of each 128-bit lane of a to 16 bits.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[16 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvexth_hu_bu (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_hu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.hu.bu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the unsigned 8-bit elements in the higher half of each 128-bit lane of a to 16 bits.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[16 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvexth_w_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_w_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.w.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the signed 16-bit elements in the higher half of each 128-bit lane of a to 32 bits.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[8 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvexth_wu_hu (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_wu_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.wu.hu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the unsigned 16-bit elements in the higher half of each 128-bit lane of a to 32 bits.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[8 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvexth_d_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.d.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the signed 32-bit elements in the higher half of each 128-bit lane of a to 64 bits.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+for (; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[4 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvexth_du_wu (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_du_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.du.wu xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend the unsigned 32-bit elements in the higher half of each 128-bit lane of a to 64 bits.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+for (; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[4 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_q_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_q_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.q.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 64-bit elements in the higher half of each 128-bit lane of a to 128-bit.

+

Operation

+
int i;
+for (i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+for (; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvexth_qu_du (__m256i a)

+

Synopsis

+
__m256i __lasx_xvexth_qu_du (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvexth.qu.du xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 64-bit elements in the higher half of each 128-bit lane of a to 128-bit.

+

Operation

+
int i;
+for (i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+for (; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextl_q_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvextl_q_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvextl.q.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend signed 64-bit elements in the lower half of each 128-bit lane of a to 128-bit.

+

Operation

+
int i;
+for (i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i];
+}
+for (; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextl_qu_du (__m256i a)

+

Synopsis

+
__m256i __lasx_xvextl_qu_du (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvextl.qu.du xr, xr
+CPU Flags: LASX
+
+

Description

+

Extend unsigned 64-bit elements in the lower half of each 128-bit lane of a to 128-bit.

+

Operation

+
int i;
+for (i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i];
+}
+for (; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

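Within each 128-bit lane, extract one 8-bit element from b and insert it into a according to imm: imm[3:0] selects the source element in b and imm[7:4] selects the destination position in a. All other elements are copied from a.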

+

Operation

+
int i;
+for (i = 0; i < 16; i++) {
+  dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+for (; i < 32; i++) {
+  dst.byte[i] =
+      (i - 16 == ((imm >> 4) & 15)) ? b.byte[(imm & 15) + 16] : a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

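Within each 128-bit lane, extract one 16-bit element from b and insert it into a according to imm: imm[2:0] selects the source element in b and imm[6:4] selects the destination position in a. All other elements are copied from a.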

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (i - 8 == ((imm >> 4) & 7)) ? b.half[(imm & 7) + 8] : a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

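Within each 128-bit lane, extract one 32-bit element from b and insert it into a according to imm: imm[1:0] selects the source element in b and imm[5:4] selects the destination position in a. All other elements are copied from a.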

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (i - 4 == ((imm >> 4) & 3)) ? b.word[(imm & 3) + 4] : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvextrins.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

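Within each 128-bit lane, extract one 64-bit element from b and insert it into a according to imm: imm[0] selects the source element in b and imm[4] selects the destination position in a. All other elements are copied from a.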

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+for (; i < 4; i++) {
+  dst.dword[i] =
+      (i - 2 == ((imm >> 4) & 1)) ? b.dword[(imm & 1) + 2] : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvpcnt_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvpcnt_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Count the number of ones (population, popcount) in 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = popcount(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvpcnt_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvpcnt_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Count the number of ones (population, popcount) in 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = popcount(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvpcnt_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvpcnt_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Count the number of ones (population, popcount) in 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = popcount(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvpcnt_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvpcnt_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvpcnt.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Count the number of ones (population, popcount) in 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = popcount(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/branch/index.html b/lasx/branch/index.html new file mode 100644 index 0000000..369aece --- /dev/null +++ b/lasx/branch/index.html @@ -0,0 +1,691 @@ + + + + + + + + Branch - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Branch

+

int __lasx_xbz_v (__m256i a)

+

Synopsis

+
int __lasx_xbz_v (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvseteqz.v fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if the whole vector a is equal to zero.

+

Operation

+
dst = a.qword[0] == 0 && a.qword[1] == 0;
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbnz_v (__m256i a)

+

Synopsis

+
int __lasx_xbnz_v (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetnez.v fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if the whole vector a is non-zero.

+

Operation

+
dst = a.qword[0] != 0 || a.qword[1] != 0;
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbz_b (__m256i a)

+

Synopsis

+
int __lasx_xbz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.b fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if any 8-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 32; i++) {
+  if (a.byte[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbz_h (__m256i a)

+

Synopsis

+
int __lasx_xbz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.h fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if any 16-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 16; i++) {
+  if (a.half[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbz_w (__m256i a)

+

Synopsis

+
int __lasx_xbz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.w fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if any 32-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 8; i++) {
+  if (a.word[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbz_d (__m256i a)

+

Synopsis

+
int __lasx_xbz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetanyeqz.d fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if any 64-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 4; i++) {
+  if (a.dword[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbnz_b (__m256i a)

+

Synopsis

+
int __lasx_xbnz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.b fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if all 8-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 32; i++) {
+  if (a.byte[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbnz_h (__m256i a)

+

Synopsis

+
int __lasx_xbnz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.h fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if all 16-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 16; i++) {
+  if (a.half[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbnz_w (__m256i a)

+

Synopsis

+
int __lasx_xbnz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.w fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if all 32-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 8; i++) {
+  if (a.word[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lasx_xbnz_d (__m256i a)

+

Synopsis

+
int __lasx_xbnz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvsetallnez.d fcc, xr; bcnez
+CPU Flags: LASX
+
+

Description

+

Expected to be used in branches: branch if all 64-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 4; i++) {
+  if (a.dword[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/float_comparison/index.html b/lasx/float_comparison/index.html new file mode 100644 index 0000000..d75a8a8 --- /dev/null +++ b/lasx/float_comparison/index.html @@ -0,0 +1,2435 @@ + + + + + + + + Floating Point Comparison - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Comparison

+

__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.caf.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.caf.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.ceq.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.ceq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cle.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cle.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.clt.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.clt.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cne.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cne.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cor.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cor.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cueq.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cueq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cule.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cule.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cult.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cult.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cun.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cun.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cune.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.cune.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.saf.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.saf.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.seq.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.seq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sle.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sle.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.slt.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.slt.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sne.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sne.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sor.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sor.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sueq.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sueq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sule.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sule.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sult.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sult.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sun.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sun.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sune.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcmp.sune.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/float_computation/index.html b/lasx/float_computation/index.html new file mode 100644 index 0000000..24d72ff --- /dev/null +++ b/lasx/float_computation/index.html @@ -0,0 +1,1439 @@ + + + + + + + + Floating Point Computation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Computation

+

__m256 __lasx_xvfadd_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfadd_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfadd.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add single precision floating point elements in a to elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m256d __lasx_xvfadd_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfadd_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfadd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add double precision floating point elements in a to elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfdiv.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide single precision floating point elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.18
3C500011, 19.50.1
+

__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfdiv.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide double precision floating point elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60008, 21.470.25
3C50008, 16.50.08
+

__m256 __lasx_xvfmax_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmax_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmax.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute maximum of single precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256d __lasx_xvfmax_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmax_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmax.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute maximum of double precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmaxa.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute maximum of single precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmaxa.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute maximum of double precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfmin_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmin_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmin.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute minimum of single precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256d __lasx_xvfmin_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmin_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmin.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute minimum of double precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfmina_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmina_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmina.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute minimum of single precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256d __lasx_xvfmina_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmina_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmina.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute minimum of double precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfmul_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfmul_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfmul.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply single precision floating point elements in a and elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256d __lasx_xvfmul_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfmul_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfmul.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply double precision floating point elements in a and elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfsub_s (__m256 a, __m256 b)

+

Synopsis

+
__m256 __lasx_xvfsub_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfsub.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract single precision floating point elements in b from elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m256d __lasx_xvfsub_d (__m256d a, __m256d b)

+

Synopsis

+
__m256d __lasx_xvfsub_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfsub.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract double precision floating point elements in b from elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m256 __lasx_xvflogb_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvflogb_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvflogb.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute base-2 logarithm of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = log2(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256d __lasx_xvflogb_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvflogb_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvflogb.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute base-2 logarithm of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = log2(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256 __lasx_xvfsqrt_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfsqrt_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfsqrt.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000150.08
3C5000270.07
+

__m256d __lasx_xvfsqrt_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfsqrt_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfsqrt.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000360.06
3C5000360.05
+

__m256 __lasx_xvfrsqrt_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrsqrt_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrt.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute reciprocal of square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000250.05
3C5000250.05
+

__m256d __lasx_xvfrsqrt_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrsqrt_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrt.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute reciprocal of square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000150.04
3C5000150.04
+

__m256 __lasx_xvfrecip_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrecip_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrecip.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute reciprocal of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = 1 / a.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000270.18
3C5000270.14
+

__m256d __lasx_xvfrecip_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrecip_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrecip.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute reciprocal of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = 1 / a.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000230.25
3C5000230.08
+

__m256 __lasx_xvfrsqrte_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrsqrte_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrte.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute estimated reciprocal of square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+

__m256d __lasx_xvfrsqrte_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrsqrte_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrsqrte.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute estimated reciprocal of square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+

__m256 __lasx_xvfrecipe_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrecipe_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrecipe.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute estimated reciprocal of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+

__m256d __lasx_xvfrecipe_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrecipe_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrecipe.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute estimated reciprocal of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/float_conversion/index.html b/lasx/float_conversion/index.html new file mode 100644 index 0000000..b269115 --- /dev/null +++ b/lasx/float_conversion/index.html @@ -0,0 +1,2227 @@ + + + + + + + + Floating Point Conversion - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Conversion

+

__m256d __lasx_xvfcvth_d_s (__m256 a)

+

Synopsis

+
__m256d __lasx_xvfcvth_d_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfcvth.d.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single precision floating point elements in higher half of a to double precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp32[4 + i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256d __lasx_xvfcvtl_d_s (__m256 a)

+

Synopsis

+
__m256d __lasx_xvfcvtl_d_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfcvtl.d.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single precision floating point elements in lower half of a to double precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)

+

Synopsis

+
__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvfcvt.s.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double precision floating point elements in a and b to single precision.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    dst.fp32[i] = b.fp64[i];
+  } else {
+    dst.fp32[i] = a.fp64[i - 4];
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256 __lasx_xvfcvth_s_h (__m256i a)

+

Synopsis

+
__m256 __lasx_xvfcvth_s_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvfcvth.s.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert half precision floating point elements in higher half of a to single precision.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp16[8 + i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256 __lasx_xvfcvtl_s_h (__m256i a)

+

Synopsis

+
__m256 __lasx_xvfcvtl_s_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvfcvtl.s.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert half precision floating point elements in lower half of a to single precision.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp16[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)

+

Synopsis

+
__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)
+#include <lasxintrin.h>
+Instruction: xvfcvt.h.s xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single precision floating point elements in a and b to half precision.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    dst.fp16[i] = b.fp32[i];
+  } else {
+    dst.fp16[i] = a.fp32[i - 8];
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m256d __lasx_xvffinth_d_w (__m256i a)

+

Synopsis

+
__m256d __lasx_xvffinth_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffinth.d.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert 32-bit integer elements in higher part of a to double precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256d __lasx_xvffintl_d_w (__m256i a)

+

Synopsis

+
__m256d __lasx_xvffintl_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffintl.d.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert 32-bit integer elements in lower part of a to double precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256d __lasx_xvffint_d_l (__m256i a)

+

Synopsis

+
__m256d __lasx_xvffint_d_l (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.d.l xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert signed 64-bit integer elements in a to double-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256d __lasx_xvffint_d_lu (__m256i a)

+

Synopsis

+
__m256d __lasx_xvffint_d_lu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.d.lu xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert unsigned 64-bit integer elements in a to double-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256 __lasx_xvffint_s_w (__m256i a)

+

Synopsis

+
__m256 __lasx_xvffint_s_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.s.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert signed 32-bit integer elements in a to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256 __lasx_xvffint_s_wu (__m256i a)

+

Synopsis

+
__m256 __lasx_xvffint_s_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvffint.s.wu xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert unsigned 32-bit integer elements in a to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)

+

Synopsis

+
__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvffint.s.l xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert 64-bit integer elements in a and b to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] =
+      (i < 4) ? (f32)(s32)a.dword[i]
+              : (f32)(s32)b.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintl_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintl.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftinth_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftinth_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftinth.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrml_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrml_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrml.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrmh_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrmh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrmh.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrpl_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrpl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrpl.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrph_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrph_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrph.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrzl_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrzl_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrzl.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrzh_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrzh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrzh.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrnel_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrnel_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrnel.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrneh_l_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrneh_l_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrneh.l.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftint_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftint_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftint.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftint_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftint_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftint.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrm_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrm_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrm.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrm_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrm_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrm.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrp_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrp_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrp.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrp_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrp_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrp.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrz_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrz_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrz_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrz_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrne_l_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrne_l_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrne.l.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrne_w_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrne_w_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrne.w.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftint_lu_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftint_lu_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftint.lu.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to unsigned 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftint_wu_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftint_wu_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftint.wu.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to unsigned 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrz_lu_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvftintrz_lu_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.lu.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a to unsigned 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftintrz_wu_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvftintrz_wu_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvftintrz.wu.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert single-precision floating point elements in a to unsigned 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftint.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrm.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrp.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrz.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)

+

Synopsis

+
__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)
+#include <lasxintrin.h>
+Instruction: xvftintrne.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i < 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/float_misc/index.html b/lasx/float_misc/index.html new file mode 100644 index 0000000..3881deb --- /dev/null +++ b/lasx/float_misc/index.html @@ -0,0 +1,767 @@ + + + + + + + + Floating Point Misc - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Misc

+

__m256i __lasx_xvfclass_d (__m256d a)

+

Synopsis

+
__m256i __lasx_xvfclass_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfclass.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Classify each double-precision floating point element in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvfclass_s (__m256 a)

+

Synopsis

+
__m256i __lasx_xvfclass_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfclass.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Classify each single-precision floating point element in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256 __lasx_xvfrint_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrint_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrint.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, using current rounding mode specified in fcsr, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrint_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrint_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrint.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, using current rounding mode specified in fcsr, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256 __lasx_xvfrintrp_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrintrp_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrp.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards positive infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrintrp_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrintrp_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrp.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards positive infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256 __lasx_xvfrintrm_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrintrm_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrm.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards negative infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrintrm_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrintrm_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrm.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards negative infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256 __lasx_xvfrintrz_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrintrz_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrz.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards zero, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrintrz_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrintrz_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrz.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards zero, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256 __lasx_xvfrintrne_s (__m256 a)

+

Synopsis

+
__m256 __lasx_xvfrintrne_s (__m256 a)
+#include <lasxintrin.h>
+Instruction: xvfrintrne.s xr, xr
+CPU Flags: LASX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards nearest even, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256d __lasx_xvfrintrne_d (__m256d a)

+

Synopsis

+
__m256d __lasx_xvfrintrne_d (__m256d a)
+#include <lasxintrin.h>
+Instruction: xvfrintrne.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards nearest even, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/fma/index.html b/lasx/fma/index.html new file mode 100644 index 0000000..f33fb9b --- /dev/null +++ b/lasx/fma/index.html @@ -0,0 +1,575 @@ + + + + + + + + Fused Multiply-Add - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Fused Multiply-Add

+

__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)

+

Synopsis

+
__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)

+

Synopsis

+
__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)

+

Synopsis

+
__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)

+

Synopsis

+
__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)

+

Synopsis

+
__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfnmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)

+

Synopsis

+
__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfnmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)

+

Synopsis

+
__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)
+#include <lasxintrin.h>
+Instruction: xvfnmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)

+

Synopsis

+
__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)
+#include <lasxintrin.h>
+Instruction: xvfnmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/integer_comparison/index.html b/lasx/integer_comparison/index.html new file mode 100644 index 0000000..eb4f75e --- /dev/null +++ b/lasx/integer_comparison/index.html @@ -0,0 +1,2151 @@ + + + + + + + + Integer Comparison - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Integer Comparison

+

__m256i __lasx_xvseq_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvseq_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the 8-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseq_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvseq_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the 16-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseq_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvseq_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the 32-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseq_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvseq_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvseq.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the 64-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the 8-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the 16-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the 32-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvseqi.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the 64-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslt_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslt_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvslt_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvslt.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslti.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvsle_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsle_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the signed 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvsle_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsle_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsle.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the signed 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslei.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compare the unsigned 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/integer_computation/index.html b/lasx/integer_computation/index.html new file mode 100644 index 0000000..e5ebe29 --- /dev/null +++ b/lasx/integer_computation/index.html @@ -0,0 +1,11903 @@ + + + + + + + + Integer Computation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Integer Computation

+

__m256i __lasx_xvadd_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvadd_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] + b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvadd_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] + b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvadd_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvadd_q (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadd_q (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadd.q xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add 128-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = a.qword[i] + b.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvabsd_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvabsd_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvabsd_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvabsd.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute absolute difference of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvadda_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadda_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add absolute values of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvadda_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadda_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add absolute values of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvadda_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadda_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add absolute values of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvadda_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvadda_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvadda.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add absolute values of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Add 8-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Add 16-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Add 32-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvaddi.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Add 64-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvavg_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+                ((a.byte[i] & b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+                ((a.byte[i] & b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+                ((a.half[i] & b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+                ((a.half[i] & b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+                ((a.word[i] & b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+                ((a.word[i] & b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavg_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+                 ((a.dword[i] & b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvavg_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavg_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavg.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+                 ((a.dword[i] & b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvavgr_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+                ((a.byte[i] | b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+                ((a.byte[i] | b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+                ((a.half[i] | b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+                ((a.half[i] | b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+                ((a.word[i] | b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+                ((a.word[i] | b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvavgr_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+                 ((a.dword[i] | b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvavgr_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvavgr_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvavgr.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+                 ((a.dword[i] | b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m256i __lasx_xvdiv_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide signed 8-bit elements in a by signed 8-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600029, 320.06
3C500032, 360.05
+

__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide unsigned 8-bit elements in a by unsigned 8-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600029, 330.06
3C500029, 590.05
+

__m256i __lasx_xvdiv_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide signed 16-bit elements in a by signed 16-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000170.12
3C500021.5, 220.08
+

__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide unsigned 16-bit elements in a by unsigned 16-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600017, 220.11
3C500017, 21.50.07
+

__m256i __lasx_xvdiv_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide signed 32-bit elements in a by signed 32-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.18
3C500011, 17.50.09
+

__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide unsigned 32-bit elements in a by unsigned 32-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.18
3C500011, 17.50.07
+

__m256i __lasx_xvdiv_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide signed 64-bit elements in a by signed 64-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600080.25
3C50008, 18.50.11
+

__m256i __lasx_xvdiv_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvdiv_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvdiv.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Divide unsigned 64-bit elements in a by unsigned 64-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600080.25
3C50008, 18.50.11
+

__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 8-bit elements in a to even-positioned signed 8-bit elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.hu.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a to even-positioned unsigned 8-bit elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 16-bit elements in a to even-positioned signed 16-bit elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.wu.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a to even-positioned unsigned 16-bit elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 32-bit elements in a to even-positioned signed 32-bit elements in b, producing 64-bit results.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.du.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a to even-positioned unsigned 32-bit elements in b, producing 64-bit results.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned signed 64-bit elements in a to even-positioned signed 64-bit elements in b, producing 128-bit results.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhaddw.qu.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a to even-positioned unsigned 64-bit elements in b, producing 128-bit results.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 8-bit elements in b from odd-positioned signed 8-bit elements in a, producing 16-bit results.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.hu.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 8-bit elements in b from odd-positioned unsigned 8-bit elements in a, producing 16-bit results.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 16-bit elements in b from odd-positioned signed 16-bit elements in a, producing 32-bit results.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.wu.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 16-bit elements in b from odd-positioned unsigned 16-bit elements in a, producing 32-bit results.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 32-bit elements in b from odd-positioned signed 32-bit elements in a, producing 64-bit results.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.du.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 32-bit elements in b from odd-positioned unsigned 32-bit elements in a, producing 64-bit results.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 2 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 64-bit elements in b from odd-positioned signed 64-bit elements in a, producing 128-bit results.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvhsubw.qu.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 64-bit elements in b from odd-positioned unsigned 64-bit elements in a, producing 128-bit results.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 3 | 2 |
| 3C5000 | 3 | 2 |
+

__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 8-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 16-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 32-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmadd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 64-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] =
+      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] =
+      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] =
+      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 7 | 1.14 |
| 3C5000 | 7 | 1.14 |
+

__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 7 | 1.14 |
| 3C5000 | 7 | 1.14 |
+

__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 7 | 1.14 |
| 3C5000 | 7 | 1.14 |
+

__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+                (u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+                 (u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 4 | 2 |
| 3C5000 | 4 | 2 |
+

__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 7 | 1.14 |
| 3C5000 | 7 | 1.14 |
+

__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+                 (u128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 7 | 1.14 |
| 3C5000 | 7 | 1.14 |
+

__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 7 | 1.14 |
| 3C5000 | 7 | 1.14 |
+

__m256i __lasx_xvmax_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmax_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmax_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmax_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmax_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmax_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmax_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 4 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvmax_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmax_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmax.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 2 | 4 |
| 3C5000 | 2 | 2 |
+

__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
| CPU | Latency | Throughput (IPC) |
| --- | --- | --- |
| 3A6000 | 1 | 4 |
| 3C5000 | 1 | 2 |
+

__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for signed 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmaxi.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise maximum for unsigned 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmin_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmin_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmin_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmin_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmin.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)

+

Synopsis

+
__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for signed 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvmini.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute elementwise minimum for unsigned 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvmod_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing signed 8-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 41 | 0.06
3C5000 | 29, 33 | 0.05
+

__m256i __lasx_xvmod_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing unsigned 8-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 37 | 0.06
3C5000 | 29, 37 | 0.05
+

__m256i __lasx_xvmod_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing signed 16-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17, 21 | 0.12
3C5000 | 17, 21 | 0.07
+

__m256i __lasx_xvmod_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing unsigned 16-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17, 25 | 0.11
3C5000 | 17, 23 | 0.06
+

__m256i __lasx_xvmod_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing signed 32-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11, 13 | 0.18
3C5000 | 11, 15 | 0.08
+

__m256i __lasx_xvmod_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing unsigned 32-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11, 13 | 0.18
3C5000 | 11, 15 | 0.06
+

__m256i __lasx_xvmod_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing signed 64-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8, 10 | 0.25
3C5000 | 8, 10 | 0.11
+

__m256i __lasx_xvmod_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmod_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmod.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute the remainder of dividing unsigned 64-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8, 10 | 0.25
3C5000 | 8, 10 | 0.11
+

__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 8-bit elements in b and c, negate and add elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 16-bit elements in b and c, negate and add elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 32-bit elements in b and c, negate and add elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvmsub.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 64-bit elements in b and c, negate and add elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmuh_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply signed 8-bit elements in a and b, save the high 8-bit result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply unsigned 8-bit elements in a and b, save the high 8-bit result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmuh_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply signed 16-bit elements in a and b, save the high 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply unsigned 16-bit elements in a and b, save the high 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmuh_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply signed 32-bit elements in a and b, save the high 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply unsigned 32-bit elements in a and b, save the high 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmuh_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply signed 64-bit elements in a and b, save the high 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmuh_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmuh_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmuh.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply unsigned 64-bit elements in a and b, save the high 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmul_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmul_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmul_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmul_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] * b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmul_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmul_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] * b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmul_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmul_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmul.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvmulwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m256i __lasx_xvneg_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvneg_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Negate 8-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = -a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvneg_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvneg_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Negate 16-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = -a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvneg_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvneg_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Negate 32-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = -a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvneg_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvneg_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvneg.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Negate 64-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = -a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the signed 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the unsigned 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the signed 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the unsigned 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the signed 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the unsigned 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the signed 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsadd_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsadd_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsadd.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating add the unsigned 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the signed 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the unsigned 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the signed 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the unsigned 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the signed 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the unsigned 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the signed 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvssub_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssub_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssub.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Saturating subtract the unsigned 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] - b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] - b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsub_q (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsub_q (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsub.q xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract 128-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = a.qword[i] - b.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Subtract imm from each 8-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Subtract imm from each 16-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Subtract imm from each 32-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsubi.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Subtract imm from each 64-bit element in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwev.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.h.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.h.bu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.w.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.w.hu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.d.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.d.wu xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.q.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsubwod.q.du xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Subtract odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/logical/index.html b/lasx/logical/index.html new file mode 100644 index 0000000..5b68e14 --- /dev/null +++ b/lasx/logical/index.html @@ -0,0 +1,681 @@ + + + + + + + + Logical - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Logical

+

__m256i __lasx_xvand_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvand_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvand.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise AND between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvandi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise AND between elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] & imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvandn_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvandn_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvandn.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise ANDN between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvnor_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvnor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvnor.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise NOR between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvnori.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise NOR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvor_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvor.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise OR between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvori.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise OR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] | imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvorn_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvorn_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvorn.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise ORN between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvxor_v (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvxor_v (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvxor.v xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Compute bitwise XOR between elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvxori.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Compute bitwise XOR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] ^ imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/memory/index.html b/lasx/memory/index.html new file mode 100644 index 0000000..d4a1291 --- /dev/null +++ b/lasx/memory/index.html @@ -0,0 +1,467 @@ + + + + + + + + Memory Load & Store - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Memory Load & Store

+

__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)

+

Synopsis

+
__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvld xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read whole vector from memory address addr + offset, save the data into dst.

+

Operation

+
dst = memory_load(256, addr + offset);
+
+

__m256i __lasx_xvldx (void * addr, long int offset)

+

Synopsis

+
__m256i __lasx_xvldx (void * addr, long int offset)
+#include <lasxintrin.h>
+Instruction: xvldx xr, r, r
+CPU Flags: LASX
+
+

Description

+

Read whole vector from memory address addr + offset, save the data into dst.

+

Operation

+
dst = memory_load(256, addr + offset);
+
+

__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)

+

Synopsis

+
__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.b xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read 8-bit data from memory address addr + (offset << 0), replicate the data to all vector lanes and save into dst.

+

Operation

+
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 32; i++) {
+  dst.byte[i] = data;
+}
+
+

__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)

+

Synopsis

+
__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.h xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read 16-bit data from memory address addr + (offset << 1), replicate the data to all vector lanes and save into dst.

+

Operation

+
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 16; i++) {
+  dst.half[i] = data;
+}
+
+

__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)

+

Synopsis

+
__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.w xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read 32-bit data from memory address addr + (offset << 2), replicate the data to all vector lanes and save into dst.

+

Operation

+
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 8; i++) {
+  dst.word[i] = data;
+}
+
+

__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)

+

Synopsis

+
__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)
+#include <lasxintrin.h>
+Instruction: xvldrepl.d xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Read 64-bit data from memory address addr + (offset << 3), replicate the data to all vector lanes and save into dst.

+

Operation

+
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 4; i++) {
+  dst.dword[i] = data;
+}
+
+

void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)

+

Synopsis

+
void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)
+#include <lasxintrin.h>
+Instruction: xvst xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Write whole vector data in data to memory address addr + offset.

+

Operation

+
memory_store(256, data, addr + offset);
+
+

void __lasx_xvstx (__m256i data, void * addr, long int offset)

+

Synopsis

+
void __lasx_xvstx (__m256i data, void * addr, long int offset)
+#include <lasxintrin.h>
+Instruction: xvstx xr, r, r
+CPU Flags: LASX
+
+

Description

+

Write whole vector data in data to memory address addr + offset.

+

Operation

+
memory_store(256, data, addr + offset);
+
+

void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)

+

Synopsis

+
void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.b xr, r, imm, imm
+CPU Flags: LASX
+
+

Description

+

Store the 8-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(8, data.byte[lane], addr + offset);
+
+

void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)

+

Synopsis

+
void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.h xr, r, imm, imm
+CPU Flags: LASX
+
+

Description

+

Store the 16-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(16, data.half[lane], addr + offset);
+
+

void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)

+

Synopsis

+
void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.w xr, r, imm, imm
+CPU Flags: LASX
+
+

Description

+

Store the 32-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(32, data.word[lane], addr + offset);
+
+

void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)

+

Synopsis

+
void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lasxintrin.h>
+Instruction: xvstelm.d xr, r, imm, imm
+CPU Flags: LASX
+
+

Description

+

Store the 64-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(64, data.dword[lane], addr + offset);
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/misc/index.html b/lasx/misc/index.html new file mode 100644 index 0000000..ce26736 --- /dev/null +++ b/lasx/misc/index.html @@ -0,0 +1,4945 @@ + + + + + + + + Misc - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Misc

+

__m256i __lasx_vext2xv_h_b (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_h_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.h.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Sign-extend the lowest 16 signed 8-bit elements of a to signed 16-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_hu_bu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_hu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.hu.bu xr, xr
+CPU Flags: LASX
+
+

Description

+

Zero-extend the lowest 16 unsigned 8-bit elements of a to unsigned 16-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_w_b (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_w_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.w.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Sign-extend the lowest 8 signed 8-bit elements of a to signed 32-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_wu_bu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_wu_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.wu.bu xr, xr
+CPU Flags: LASX
+
+

Description

+

Zero-extend the lowest 8 unsigned 8-bit elements of a to unsigned 32-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_w_h (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_w_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.w.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Sign-extend the lowest 8 signed 16-bit elements of a to signed 32-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_wu_hu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_wu_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.wu.hu xr, xr
+CPU Flags: LASX
+
+

Description

+

Zero-extend the lowest 8 unsigned 16-bit elements of a to unsigned 32-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_d_b (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_d_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Sign-extend the lowest 4 signed 8-bit elements of a to signed 64-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_du_bu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_du_bu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.bu xr, xr
+CPU Flags: LASX
+
+

Description

+

Zero-extend the lowest 4 unsigned 8-bit elements of a to unsigned 64-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u8)a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_d_h (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_d_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Sign-extend the lowest 4 signed 16-bit elements of a to signed 64-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_du_hu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_du_hu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.hu xr, xr
+CPU Flags: LASX
+
+

Description

+

Zero-extend the lowest 4 unsigned 16-bit elements of a to unsigned 64-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_d_w (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_d_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.d.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Sign-extend the lowest 4 signed 32-bit elements of a to signed 64-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_vext2xv_du_wu (__m256i a)

+

Synopsis

+
__m256i __lasx_vext2xv_du_wu (__m256i a)
+#include <lasxintrin.h>
+Instruction: vext2xv.du.wu xr, xr
+CPU Flags: LASX
+
+

Description

+

Zero-extend the lowest 4 unsigned 32-bit elements of a to unsigned 64-bit elements and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_xvilvh_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvh_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 8-bit elements in higher half of a and b.

+

Operation

+
int i;
+for (i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+for (; i < 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 16] : b.byte[i / 2 + 16];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvh_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvh_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 16-bit elements in higher half of a and b.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 8] : b.half[i / 2 + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvh_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvh_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 32-bit elements in higher half of a and b.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 4] : b.word[i / 2 + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvh_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvh_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvh.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 64-bit elements in higher half of a and b.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+for (; i < 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 2] : b.dword[i / 2 + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvl_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvl_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 8-bit elements in lower half of a and b.

+

Operation

+
int i;
+for (i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+for (; i < 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvl_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvl_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 16-bit elements in lower half of a and b.

+

Operation

+
int i;
+for (i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+for (; i < 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvl_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvl_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 32-bit elements in lower half of a and b.

+

Operation

+
int i;
+for (i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+for (; i < 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvilvl_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvilvl_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvilvl.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Interleave 64-bit elements in lower half of a and b.

+

Operation

+
int i;
+for (i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+for (; i < 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvinsgr2vr.w xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Insert 32-bit element into lane indexed imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)

+

Synopsis

+
__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvinsgr2vr.d xr, r, imm
+CPU Flags: LASX
+
+

Description

+

Insert 64-bit element into lane indexed imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvinsve0.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Insert the first 32-bit lane of b into lane indexed imm of a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i == imm) ? b.word[0] : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)

+

Synopsis

+
__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvinsve0.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Insert the first 64-bit lane of b into lane indexed imm of a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i == imm) ? b.dword[0] : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvfrstp.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

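For each 128-bit half, find the first negative 8-bit element in b and write its index within that half (16 if there is none) to the lane of a selected by c (taken modulo 16); all other lanes of dst are copied from a.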

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[c.byte[0] % 16] = i;
+for (i = 16; i < 32; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[(c.byte[16] % 16) + 16] = i - 16;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvfrstp.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

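For each 128-bit half, find the first negative 16-bit element in b and write its index within that half (8 if there is none) to the lane of a selected by c (taken modulo 8); all other lanes of dst are copied from a.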

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[c.half[0] % 8] = i;
+for (i = 8; i < 16; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[(c.half[8] % 8) + 8] = i - 8;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvfrstpi.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

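For each 128-bit half, find the first negative 8-bit element in b and write its index within that half (16 if there is none) to the lane of a selected by imm (taken modulo 16); all other lanes of dst are copied from a.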

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[imm % 16] = i;
+for (i = 16; i < 32; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[(imm % 16) + 16] = i - 16;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvfrstpi.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

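For each 128-bit half, find the first negative 16-bit element in b and write its index within that half (8 if there is none) to the lane of a selected by imm (taken modulo 8); all other lanes of dst are copied from a.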

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[imm % 8] = i;
+for (i = 8; i < 16; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[(imm % 8) + 8] = i - 8;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m256i __lasx_xvmskgez_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskgez_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskgez.b xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 8-bit element in a, if the element is greater than or equal to zero, set one bit in dst, otherwise clear it.

+

Operation

+
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmskltz_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskltz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.b xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 8-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

+

Operation

+
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmskltz_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskltz_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.h xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 16-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

+

Operation

+
u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[2] |= c << 4;
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmskltz_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskltz_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.w xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 32-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

+

Operation

+
u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c |= c << 31;
+c >>= 62;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c |= c << 31;
+c >>= 62;
+dst.dword[2] |= c << 2;
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmskltz_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmskltz_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmskltz.d xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 64-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

+

Operation

+
u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+c = m & a.dword[2];
+c >>= 63;
+dst.dword[2] = c;
+c = m & a.dword[3];
+c >>= 63;
+dst.dword[2] |= c << 1;
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvmsknz_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvmsknz_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvmsknz.b xr, xr
+CPU Flags: LASX
+
+

Description

+

For each 8-bit element in a, if the element is non-zero, set one bit in dst, otherwise clear it.

+

Operation

+
u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = ~(((a.dword[2] & m) + m) | a.dword[2] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] = c;
+c = ~(((a.dword[3] & m) + m) | a.dword[3] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[2] |= c << 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvpackev_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack even-positioned 8-bit elements in a and b and store them in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackev_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the even-positioned 16-bit elements in a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackev_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the even-positioned 32-bit elements in a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackev_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackev.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the even-positioned 64-bit elements in a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackod_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the odd-positioned 8-bit elements in a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackod_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the odd-positioned 16-bit elements in a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackod_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the odd-positioned 32-bit elements in a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpackod_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpackod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpackod.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Collect and pack the odd-positioned 64-bit elements in a and b, and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickev_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickev_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, pick the even-positioned 8-bit elements in b first, then the even-positioned 8-bit elements in a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2] : a.byte[(i - 16) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickev_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickev_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, pick the even-positioned 16-bit elements in b first, then the even-positioned 16-bit elements in a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (i < 12) ? b.half[(i - 4) * 2] : a.half[(i - 8) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickev_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickev_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, pick the even-positioned 32-bit elements in b first, then the even-positioned 32-bit elements in a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (i < 6) ? b.word[(i - 2) * 2] : a.word[(i - 4) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickev_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickev_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickev.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, pick the even-positioned 64-bit element in b first, then the even-positioned 64-bit element in a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2] : a.dword[(i - 2) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Copy one 32-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)

+

Synopsis

+
__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Copy one 64-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)

+

Synopsis

+
__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Copy one 32-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)

+

Synopsis

+
__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)
+#include <lasxintrin.h>
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Copy one 64-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)

+

Synopsis

+
int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.w r, xr, imm
+CPU Flags: LASX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s32)a.word[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)

+

Synopsis

+
unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.wu r, xr, imm
+CPU Flags: LASX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u32)a.word[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)

+

Synopsis

+
long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.d r, xr, imm
+CPU Flags: LASX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s64)a.dword[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)

+

Synopsis

+
unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvpickve2gr.du r, xr, imm
+CPU Flags: LASX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u64)a.dword[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvpickod_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickod_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, pick the odd-positioned 8-bit elements in b first, then the odd-positioned 8-bit elements in a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2 + 1] : a.byte[(i - 16) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickod_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickod_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, pick the odd-positioned 16-bit elements in b first, then the odd-positioned 16-bit elements in a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (i < 12) ? b.half[(i - 4) * 2 + 1] : a.half[(i - 8) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickod_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickod_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, pick the odd-positioned 32-bit elements in b first, then the odd-positioned 32-bit elements in a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (i < 6) ? b.word[(i - 2) * 2 + 1] : a.word[(i - 4) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvpickod_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvpickod_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvpickod.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane, pick the odd-positioned 64-bit element in b first, then the odd-positioned 64-bit element in a, and store them in the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2 + 1] : a.dword[(i - 2) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvrepli_b (imm_n512_511 imm)

+

Synopsis

+
__m256i __lasx_xvrepli_b (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = imm;
+}
+
+

Tested on real machine.

+

__m256i __lasx_xvrepli_h (imm_n512_511 imm)

+

Synopsis

+
__m256i __lasx_xvrepli_h (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = imm;
+}
+
+

Tested on real machine.

+

__m256i __lasx_xvrepli_w (imm_n512_511 imm)

+

Synopsis

+
__m256i __lasx_xvrepli_w (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = imm;
+}
+
+

Tested on real machine.

+

__m256i __lasx_xvrepli_d (imm_n512_511 imm)

+

Synopsis

+
__m256i __lasx_xvrepli_d (imm_n512_511 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = imm;
+}
+
+

Tested on real machine.

+

__m256i __lasx_xvreplgr2vr_b (int val)

+

Synopsis

+
__m256i __lasx_xvreplgr2vr_b (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.b xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat val as 8-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | N/A | 1
3C5000 | N/A | 1
+

__m256i __lasx_xvreplgr2vr_h (int val)

+

Synopsis

+
__m256i __lasx_xvreplgr2vr_h (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.h xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat val as 16-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | N/A | 1
3C5000 | N/A | 1
+

__m256i __lasx_xvreplgr2vr_w (int val)

+

Synopsis

+
__m256i __lasx_xvreplgr2vr_w (int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.w xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat val as 32-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | N/A | 1
3C5000 | N/A | 1
+

__m256i __lasx_xvreplgr2vr_d (long int val)

+

Synopsis

+
__m256i __lasx_xvreplgr2vr_d (long int val)
+#include <lasxintrin.h>
+Instruction: xvreplgr2vr.d xr, r
+CPU Flags: LASX
+
+

Description

+

Repeat val as 64-bit elements to fill the whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | N/A | 1
3C5000 | N/A | 1
+

__m256i __lasx_xvreplve_b (__m256i a, int idx)

+

Synopsis

+
__m256i __lasx_xvreplve_b (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.b xr, xr, r
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane of a, repeat the 8-bit element at index idx (taken modulo 16) to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[idx % 16];
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = a.byte[(idx % 16) + 16];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvreplve_h (__m256i a, int idx)

+

Synopsis

+
__m256i __lasx_xvreplve_h (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.h xr, xr, r
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane of a, repeat the 16-bit element at index idx (taken modulo 8) to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[idx % 8];
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = a.half[(idx % 8) + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvreplve_w (__m256i a, int idx)

+

Synopsis

+
__m256i __lasx_xvreplve_w (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.w xr, xr, r
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane of a, repeat the 32-bit element at index idx (taken modulo 4) to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[idx % 4];
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = a.word[(idx % 4) + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvreplve_d (__m256i a, int idx)

+

Synopsis

+
__m256i __lasx_xvreplve_d (__m256i a, int idx)
+#include <lasxintrin.h>
+Instruction: xvreplve.d xr, xr, r
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane of a, repeat the 64-bit element at index idx (taken modulo 2) to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[idx % 2];
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = a.dword[(idx % 2) + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 1
3C5000 | 1 | 1
+

__m256i __lasx_xvreplve0_b (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_b (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.b xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 8-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvreplve0_h (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_h (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.h xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 16-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvreplve0_w (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_w (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.w xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 32-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvreplve0_d (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_d (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.d xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 64-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvreplve0_q (__m256i a)

+

Synopsis

+
__m256i __lasx_xvreplve0_q (__m256i a)
+#include <lasxintrin.h>
+Instruction: xvreplve0.q xr, xr
+CPU Flags: LASX
+
+

Description

+

Repeat the first 128-bit lane from a to all lanes of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.qword[i] = a.qword[0];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 4
3C5000 | 3 | 2
+

__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)

+

Synopsis

+
__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane of a, repeat the 8-bit element at index idx to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[idx];
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = a.byte[idx + 16];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)

+

Synopsis

+
__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane of a, repeat the 16-bit element at index idx to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[idx];
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = a.half[idx + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)

+

Synopsis

+
__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane of a, repeat the 32-bit element at index idx to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[idx];
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = a.word[idx + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)

+

Synopsis

+
__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)
+#include <lasxintrin.h>
+Instruction: xvrepl128vei.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Within each 128-bit lane of a, repeat the 64-bit element at index idx to fill the corresponding 128-bit lane of dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[idx];
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = a.dword[idx + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp signed 8-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp unsigned 8-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp signed 16-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp unsigned 16-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp signed 32-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp unsigned 32-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp signed 64-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsat.du xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Clamp unsigned 64-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

If the 8-bit element in a equals zero, set the result to zero. If the signed 8-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] =
+      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

If the 16-bit element in a equals zero, set the result to zero. If the signed 16-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] =
+      (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

If the 32-bit element in a equals zero, set the result to zero. If the signed 32-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] =
+      (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsigncov.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

If the 64-bit element in a equals zero, set the result to zero. If the signed 64-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] =
+      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvldi (imm_n1024_1023 imm)

+

Synopsis

+
__m256i __lasx_xvldi (imm_n1024_1023 imm)
+#include <lasxintrin.h>
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+
+

Description

+

Initialize dst using predefined patterns:

+
    +
  • imm[12:10]=0b000: broadcast imm[7:0] as 8-bit elements to all lanes
  • +
  • imm[12:10]=0b001: broadcast sign-extended imm[9:0] as 16-bit elements to all lanes
  • +
  • imm[12:10]=0b010: broadcast sign-extended imm[9:0] as 32-bit elements to all lanes
  • +
  • imm[12:10]=0b011: broadcast sign-extended imm[9:0] as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes
  • +
  • imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes
  • +
  • imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes
  • +
  • imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast the result as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b11010: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b11011: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b11100: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48) as 64-bit elements to all lanes
  • +
+

Operation

+
u64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+  broadcast_value = simm9_0;
+  broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+  broadcast_value = simm9_0;
+  broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+  broadcast_value = simm9_0;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+  broadcast_value = imm7_0 << 16;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+  broadcast_value = imm7_0 << 24;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+  broadcast_value = imm7_0;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+  broadcast_value = (imm7_0 << 8) | 0xFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+  broadcast_value = (imm7_0 << 16) | 0xFFFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+                    imm7 * 0xFF00000000000000;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+  broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+                    (imm5_0 << 48);
+  broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+  for (int i = 0; i < 32; i++) {
+    dst.byte[i] = broadcast_value;
+  }
+} else if (broadcast_width == 16) {
+  for (int i = 0; i < 16; i++) {
+    dst.half[i] = broadcast_value;
+  }
+} else if (broadcast_width == 32) {
+  for (int i = 0; i < 8; i++) {
+    dst.word[i] = broadcast_value;
+  }
+} else if (broadcast_width == 64) {
+  for (int i = 0; i < 4; i++) {
+    dst.dword[i] = broadcast_value;
+  }
+}
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/permutation/index.html b/lasx/permutation/index.html new file mode 100644 index 0000000..50880e6 --- /dev/null +++ b/lasx/permutation/index.html @@ -0,0 +1,403 @@ + + + + + + + + Permutation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Permutation

+

__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Permute words from a and b with indices recorded in imm and store into dst.

+

Operation

+
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+dst.word[4] = b.word[4 + (imm & 0x3)];
+dst.word[5] = b.word[4 + ((imm >> 2) & 0x3)];
+dst.word[6] = a.word[4 + ((imm >> 4) & 0x3)];
+dst.word[7] = a.word[4 + ((imm >> 6) & 0x3)];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Permute double words from a with indices recorded in imm and store into dst.

+

Operation

+
dst.dword[0] = a.dword[imm & 0x3];
+dst.dword[1] = a.dword[(imm >> 2) & 0x3];
+dst.dword[2] = a.dword[(imm >> 4) & 0x3];
+dst.dword[3] = a.dword[(imm >> 6) & 0x3];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvpermi.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Permute quad words from a and b with indices recorded in imm and store into dst.

+

Operation

+
if ((imm & 0x4) && MACHINE_3C5000) {
+  // Caveat: observed in 3C5000
+  dst.qword[0] = 0;
+} else {
+  dst.qword[0] = (imm & 2) ? a.qword[imm & 0x1] : b.qword[imm & 0x1];
+}
+if ((imm & 0x80) && MACHINE_3C5000) {
+  // Caveat: observed in 3C5000
+  dst.qword[1] = 0;
+} else {
+  dst.qword[1] =
+      (imm & 0x20) ? a.qword[(imm >> 4) & 0x1] : b.qword[(imm >> 4) & 0x1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+

__m256i __lasx_xvperm_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvperm_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvperm.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Permute words from a with indices recorded in b and store into dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[b.word[i] % 0x8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500032
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/shift/index.html b/lasx/shift/index.html new file mode 100644 index 0000000..c814d83 --- /dev/null +++ b/lasx/shift/index.html @@ -0,0 +1,8900 @@ + + + + + + + + Shift - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Shift

+

__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbsll.v xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Shift each 128-bit lane of a left by imm * 8 bits (the shift amount is taken modulo 128) and store the result to dst.

+

Operation

+
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+dst.qword[1] = (u128)a.qword[1] << shift;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvbsrl.v xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Shift each 128-bit lane of a right by imm * 8 bits (the shift amount is taken modulo 128) and store the result to dst.

+

Operation

+
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+dst.qword[1] = (u128)a.qword[1] >> shift;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsll_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsll_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsll_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsll_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsll_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsll_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsll_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsll_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsll.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvslli.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical left shift the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.h.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift signed 8-bit elements in a by imm to signed 16-bit result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i + 8] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.hu.bu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift unsigned 8-bit elements in a by imm to unsigned 16-bit result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i + 8] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.w.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift signed 16-bit elements in a by imm to signed 32-bit result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[i + 4] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.wu.hu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift unsigned 16-bit elements in a by imm to unsigned 32-bit result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[i + 4] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.d.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift signed 32-bit elements in a by imm to signed 64-bit result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i + 2] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsllwil.du.wu xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Extend and shift unsigned 32-bit elements in a by imm to unsigned 64-bit result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i + 2] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsra_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsra_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsra_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsra_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsra_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsra_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsra_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsra_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsra.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrai.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? (s8)((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] =
+      (i < 12) ? (s16)((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsran.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] =
+      (i < 6) ? (s32)((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? (s8)((s16)b.half[i - 8] >> imm)
+                         : (s8)((s16)a.half[i - 16] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (i < 12) ? (s16)((s32)b.word[i - 4] >> imm)
+                         : (s16)((s32)a.word[i - 8] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+                        : (s32)((s64)a.dword[i - 2] >> imm);
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (i < 6) ? (s32)((s64)b.dword[i - 2] >> imm)
+                        : (s32)((s64)a.dword[i - 4] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrani.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+                         : (s64)((s128)a.qword[i - 1] >> imm);
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (i < 3) ? (s64)((s128)b.qword[i - 1] >> imm)
+                         : (s64)((s128)a.qword[i - 2] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrar_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrar_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if ((b.byte[i] & 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+                  (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrar_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrar_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((b.half[i] & 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+                  (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrar_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrar_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((b.word[i] & 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+                  (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrar_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrar_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrar.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((b.dword[i] & 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+                   (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] =
+        ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] =
+        ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrari.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] =
+        ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u8 shift = (b.half[i] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+                         (((s16)a.half[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u8 shift = (b.half[i - 8] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> shift) +
+                         (((s16)a.half[i - 8] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u8 shift = (b.word[i] & 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+                          (((s32)a.word[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u8 shift = (b.word[i - 4] & 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] >> shift) +
+                          (((s32)a.word[i - 4] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrarn.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u8 shift = (b.dword[i] & 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+                          (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u8 shift = (b.dword[i - 2] & 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> shift) +
+                          (((s64)a.dword[i - 2] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+                         (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)b.half[i - 8] >> imm) +
+                         (((s16)b.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 16];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 16] >> imm) +
+                         (((s16)a.half[i - 16] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+                          (((s32)b.word[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+                          (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i - 4] >> imm) +
+                          (((s32)b.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 8];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 8] >> imm) +
+                          (((s32)a.word[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+                          (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+                          (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i - 2] >> imm) +
+                          (((s64)b.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 4];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 4] >> imm) +
+                          (((s64)a.dword[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrarni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+                           (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+                           (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i - 1] >> imm) +
+                           (((s128)b.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 2];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 2] >> imm) +
+                           (((s128)a.qword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrl_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrl_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrl_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrl_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrl_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrl_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrl_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrl_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrl.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrli.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = a.dword[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? (u8)((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] =
+      (i < 12) ? (u16)((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrln.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] =
+      (i < 6) ? (u32)((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+for (int i = 16; i < 32; i++) {
+  dst.byte[i] = (i < 24) ? (u8)((u16)b.half[i - 8] >> imm)
+                         : (u8)((u16)a.half[i - 16] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+for (int i = 8; i < 16; i++) {
+  dst.half[i] = (i < 12) ? (u16)((u32)b.word[i - 4] >> imm)
+                         : (u16)((u32)a.word[i - 8] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+                        : (u32)((u64)a.dword[i - 2] >> imm);
+}
+for (int i = 4; i < 8; i++) {
+  dst.word[i] = (i < 6) ? (u32)((u64)b.dword[i - 2] >> imm)
+                        : (u32)((u64)a.dword[i - 4] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+                         : (u64)((u128)a.qword[i - 1] >> imm);
+}
+for (int i = 2; i < 4; i++) {
+  dst.dword[i] = (i < 3) ? (u64)((u128)b.qword[i - 1] >> imm)
+                         : (u64)((u128)a.qword[i - 2] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if ((b.byte[i] & 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+                  ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((b.half[i] & 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+                  ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((b.word[i] & 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+                  ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlr.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((b.dword[i] & 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+                   ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlri.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u8 shift = (b.half[i] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+                         (((u16)a.half[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u8 shift = (b.half[i - 8] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> shift) +
+                         (((u16)a.half[i - 8] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u8 shift = (b.word[i] & 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+                          (((u32)a.word[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u8 shift = (b.word[i - 4] & 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] >> shift) +
+                          (((u32)a.word[i - 4] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvsrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u8 shift = (b.dword[i] & 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+                          (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u8 shift = (b.dword[i - 2] & 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> shift) +
+                          (((u64)a.dword[i - 2] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+                         (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)b.half[i - 8] >> imm) +
+                         (((u16)b.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 16];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 16] >> imm) +
+                         (((u16)a.half[i - 16] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+                          (((u32)b.word[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+                          (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i - 4] >> imm) +
+                          (((u32)b.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 8];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 8] >> imm) +
+                          (((u32)a.word[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+                          (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+                          (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i - 2] >> imm) +
+                          (((u64)b.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 4];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 4] >> imm) +
+                          (((u64)a.dword[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvsrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+                           (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+                           (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i - 1] >> imm) +
+                           (((u128)b.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 2];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 2] >> imm) +
+                           (((u128)a.qword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.bu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.hu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssran.wu.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)b.half[i] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp = (s16)b.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 16] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.bu.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)b.half[i] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp = (s16)b.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 16] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)b.word[i] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp = (s32)b.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 8] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.hu.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)b.word[i] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp = (s32)b.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 8] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)b.dword[i] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp = (s64)b.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 4] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.wu.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)b.dword[i] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp = (s64)b.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 4] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp = (s128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    s128 temp = (s128)b.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 2] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrani.du.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp = (s128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    s128 temp = (s128)b.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 2] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp;
+    if ((b.half[i - 8] & 15) == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+             (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.bu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp;
+    if ((b.half[i - 8] & 15) == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+             (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp;
+    if ((b.word[i - 4] & 31) == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+             (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.hu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp;
+    if ((b.word[i - 4] & 31) == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+             (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp;
+    if ((b.dword[i - 2] & 63) == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+             (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrarn.wu.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp;
+    if ((b.dword[i - 2] & 63) == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+             (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i - 8];
+    } else {
+      temp =
+          ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 16];
+    } else {
+      temp = ((s16)a.half[i - 16] >> imm) +
+             (((s16)a.half[i - 16] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i - 8];
+    } else {
+      temp =
+          ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 16];
+    } else {
+      temp = ((s16)a.half[i - 16] >> imm) +
+             (((s16)a.half[i - 16] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i - 4];
+    } else {
+      temp =
+          ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 8];
+    } else {
+      temp =
+          ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i - 4];
+    } else {
+      temp =
+          ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 8];
+    } else {
+      temp =
+          ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> imm) +
+             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i - 2];
+    } else {
+      temp = ((s64)b.dword[i - 2] >> imm) +
+             (((s64)b.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 4];
+    } else {
+      temp = ((s64)a.dword[i - 4] >> imm) +
+             (((s64)a.dword[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> imm) +
+             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i - 2];
+    } else {
+      temp = ((s64)b.dword[i - 2] >> imm) +
+             (((s64)b.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 4];
+    } else {
+      temp = ((s64)a.dword[i - 4] >> imm) +
+             (((s64)a.dword[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] >> imm) +
+             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i - 1];
+    } else {
+      temp = ((s128)b.qword[i - 1] >> imm) +
+             (((s128)b.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 2];
+    } else {
+      temp = ((s128)a.qword[i - 2] >> imm) +
+             (((s128)a.qword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrarni.du.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] >> imm) +
+             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i - 1];
+    } else {
+      temp = ((s128)b.qword[i - 1] >> imm) +
+             (((s128)b.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 2];
+    } else {
+      temp = ((s128)a.qword[i - 2] >> imm) +
+             (((s128)a.qword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

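Logical right shift the unsigned 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.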

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.bu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

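Logical right shift the unsigned 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.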

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

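Logical right shift the unsigned 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.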

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.hu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

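Logical right shift the unsigned 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.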

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

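Logical right shift the unsigned 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.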

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrln.wu.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

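Logical right shift the unsigned 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.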

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)b.half[i] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp = (u16)b.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 16] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)b.half[i] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp = (u16)b.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 16] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)b.word[i] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp = (u32)b.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 8] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)b.word[i] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp = (u32)b.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 8] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)b.dword[i] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp = (u64)b.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 4] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)b.dword[i] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp = (u64)b.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 4] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp = (u128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    u128 temp = (u128)b.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 2] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlni.du.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp = (u128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    u128 temp = (u128)b.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 2] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

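Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.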

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp;
+    if ((b.half[i - 8] & 15) == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+             (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.bu.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

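Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.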

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp;
+    if ((b.half[i - 8] & 15) == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +
+             (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

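Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.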

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp;
+    if ((b.word[i - 4] & 31) == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+             (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.hu.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

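Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.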

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp;
+    if ((b.word[i - 4] & 31) == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +
+             (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

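Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.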

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp;
+    if ((b.dword[i - 2] & 63) == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+             (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvssrlrn.wu.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

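Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst. The results occupy the lower half of each 128-bit lane of dst; the upper half of each lane is set to zero.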

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp;
+    if ((b.dword[i - 2] & 63) == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +
+             (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i - 8];
+    } else {
+      temp =
+          ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 16];
+    } else {
+      temp = ((u16)a.half[i - 16] >> imm) +
+             (((u16)a.half[i - 16] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.bu.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+for (int i = 16; i < 32; i++) {
+  if (i < 24) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i - 8];
+    } else {
+      temp =
+          ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 16];
+    } else {
+      temp = ((u16)a.half[i - 16] >> imm) +
+             (((u16)a.half[i - 16] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i - 4];
+    } else {
+      temp =
+          ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 8];
+    } else {
+      temp =
+          ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.hu.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+for (int i = 8; i < 16; i++) {
+  if (i < 12) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i - 4];
+    } else {
+      temp =
+          ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 8];
+    } else {
+      temp =
+          ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> imm) +
+             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i - 2];
+    } else {
+      temp = ((u64)b.dword[i - 2] >> imm) +
+             (((u64)b.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 4];
+    } else {
+      temp = ((u64)a.dword[i - 4] >> imm) +
+             (((u64)a.dword[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.wu.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> imm) +
+             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i < 8; i++) {
+  if (i < 6) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i - 2];
+    } else {
+      temp = ((u64)b.dword[i - 2] >> imm) +
+             (((u64)b.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 4];
+    } else {
+      temp = ((u64)a.dword[i - 4] >> imm) +
+             (((u64)a.dword[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] >> imm) +
+             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i - 1];
+    } else {
+      temp = ((u128)b.qword[i - 1] >> imm) +
+             (((u128)b.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 2];
+    } else {
+      temp = ((u128)a.qword[i - 2] >> imm) +
+             (((u128)a.qword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)

+

Synopsis

+
__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include <lasxintrin.h>
+Instruction: xvssrlrni.du.q xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] >> imm) +
+             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i < 4; i++) {
+  if (i < 3) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i - 1];
+    } else {
+      temp = ((u128)b.qword[i - 1] >> imm) +
+             (((u128)b.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 2];
+    } else {
+      temp = ((u128)a.qword[i - 2] >> imm) +
+             (((u128)a.qword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m256i __lasx_xvrotr_b (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvrotr_b (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.b xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] =
+      (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvrotr_h (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvrotr_h (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+                (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvrotr_w (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvrotr_w (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+                (a.word[i] << (32 - (b.word[i] & 0x1f)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvrotr_d (__m256i a, __m256i b)

+

Synopsis

+
__m256i __lasx_xvrotr_d (__m256i a, __m256i b)
+#include <lasxintrin.h>
+Instruction: xvrotr.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+                 (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)

+

Synopsis

+
__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)

+

Synopsis

+
__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)

+

Synopsis

+
__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 2 | 2
+

__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)

+

Synopsis

+
__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)
+#include <lasxintrin.h>
+Instruction: xvrotri.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Rotate right the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 2 | 2
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lasx/shuffling/index.html b/lasx/shuffling/index.html new file mode 100644 index 0000000..7cb7f06 --- /dev/null +++ b/lasx/shuffling/index.html @@ -0,0 +1,615 @@ + + + + + + + + Shuffling - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Shuffling

+

__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.b xr, xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Shuffle bytes from a and b with indices from c.

+

Caveat: the indices are placed in c, while in the other xvshuf intrinsics (xvshuf_h, xvshuf_w, xvshuf_d) they are placed in a.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  if ((c.byte[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.byte[i] = 0;
+  } else if ((c.byte[i] % 32) < 16) {
+    dst.byte[i] = b.byte[(c.byte[i] % 32) + ((i >= 16) ? 16 : 0)];
+  } else {
+    dst.byte[i] = a.byte[(c.byte[i] % 32) + ((i >= 16) ? 0 : -16)];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.h xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Shuffle 16-bit elements in b and c with indices from a, save the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.half[i] = 0;
+  } else if ((a.half[i] % 16) < 8) {
+    dst.half[i] = c.half[(a.half[i] % 16) + ((i >= 8) ? 8 : 0)];
+  } else {
+    dst.half[i] = b.half[(a.half[i] % 16) + ((i >= 8) ? 0 : -8)];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.w xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Shuffle 32-bit elements in b and c with indices from a, save the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.word[i] = 0;
+  } else if ((a.word[i] % 8) < 4) {
+    dst.word[i] = c.word[(a.word[i] % 8) + ((i >= 4) ? 4 : 0)];
+  } else {
+    dst.word[i] = b.word[(a.word[i] % 8) + ((i >= 4) ? 0 : -4)];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)

+

Synopsis

+
__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)
+#include <lasxintrin.h>
+Instruction: xvshuf.d xr, xr, xr
+CPU Flags: LASX
+
+

Description

+

Shuffle 64-bit elements in b and c with indices from a, save the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.dword[i] = 0;
+  } else if ((a.dword[i] % 4) < 2) {
+    dst.dword[i] = c.dword[(a.dword[i] % 4) + ((i >= 2) ? 2 : 0)];
+  } else {
+    dst.dword[i] = b.dword[(a.dword[i] % 4) + ((i >= 2) ? 0 : -2)];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 2
3C5000 | 1 | 2
+

__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.b xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Shuffle every four 8-bit elements in a with indices packed in imm, save the result to dst.

+

Operation

+
for (int i = 0; i < 32; i++) {
+  dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.h xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Shuffle every four 16-bit elements in a with indices packed in imm, save the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.w xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

Shuffle every four 32-bit elements in a with indices packed in imm, save the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)

+

Synopsis

+
__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)
+#include <lasxintrin.h>
+Instruction: xvshuf4i.d xr, xr, imm
+CPU Flags: LASX
+
+

Description

+

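For each 128-bit lane, select each 64-bit element of dst from the four 64-bit elements of a and b in that lane, using the 2-bit indices packed in the low four bits of imm, and save the result to dst.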

+

Operation

+
dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+    (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+dst.dword[2] = (imm & 2) ? b.dword[(imm & 1) + 2] : a.dword[(imm & 1) + 2];
+dst.dword[3] =
+    (imm & 8) ? b.dword[((imm >> 2) & 1) + 2] : a.dword[((imm >> 2) & 1) + 2];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/latency_throughput/index.html b/latency_throughput/index.html new file mode 100644 index 0000000..1543a81 --- /dev/null +++ b/latency_throughput/index.html @@ -0,0 +1,192 @@ + + + + + + + + Latency and Throughput of Instructions - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

Latency and Throughput of Instructions

+

Latency and throughput (CPI) of each instruction:

+ + + + + + + + + + + + +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/bitwise_operations/index.html b/lsx/bitwise_operations/index.html new file mode 100644 index 0000000..994c791 --- /dev/null +++ b/lsx/bitwise_operations/index.html @@ -0,0 +1,2739 @@ + + + + + + + + Bitwise Operations - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Bitwise Operations

+

__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vbitsel.v vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise selection: for each bit position, if the bit in c is one, copy the bit from b to dst; otherwise copy it from a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+
Instruction3A60003C5000
LatencyThroughput (CPI)LatencyThroughput (CPI)
vabsd_b2222
vabsd_bu2222
vabsd_d2222
vabsd_du2222
vabsd_h2222
vabsd_hu2222
vabsd_w2222
vabsd_wu2222
vadd_b1412
vadd_d1412
vadd_h1412
vadd_q3232
vadd_w1412
vadda_b3232
vadda_d3232
vadda_h3232
vadda_w3232
vaddi_bu1412
vaddi_du1412
vaddi_hu1412
vaddi_wu1412
vaddwev_d_w2222
vaddwev_d_wu2222
vaddwev_d_wu_w2222
vaddwev_h_b2222
vaddwev_h_bu2222
vaddwev_h_bu_b2222
vaddwev_q_d3232
vaddwev_q_du3232
vaddwev_q_du_d3232
vaddwev_w_h2222
vaddwev_w_hu2222
vaddwev_w_hu_h2222
vaddwod_d_w2222
vaddwod_d_wu2222
vaddwod_d_wu_w2222
vaddwod_h_b2222
vaddwod_h_bu2222
vaddwod_h_bu_b2222
vaddwod_q_d3232
vaddwod_q_du3232
vaddwod_q_du_d3232
vaddwod_w_h2222
vaddwod_w_hu2222
vaddwod_w_hu_h2222
vand_v1412
vandi_b1412
vandn_v1412
vavg_b1412
vavg_bu1412
vavg_d2422
vavg_du2422
vavg_h1412
vavg_hu1412
vavg_w1412
vavg_wu1412
vavgr_b1412
vavgr_bu1412
vavgr_d2422
vavgr_du2422
vavgr_h1412
vavgr_hu1412
vavgr_w1412
vavgr_wu1412
vbitclr_b2222
vbitclr_d2222
vbitclr_h2222
vbitclr_w2222
vbitclri_b2222
vbitclri_d2222
vbitclri_h2222
vbitclri_w2222
vbitrev_b2222
vbitrev_d2222
vbitrev_h2222
vbitrev_w2222
vbitrevi_b2222
vbitrevi_d2222
vbitrevi_h2222
vbitrevi_w2222
vbitsel_v1212
vbitseli_b1212
vbitset_b2222
vbitset_d2222
vbitset_h2222
vbitset_w2222
vbitseti_b2222
vbitseti_d2222
vbitseti_h2222
vbitseti_w2222
vbsll_v1412
vbsrl_v1412
vclo_b2422
vclo_d2422
vclo_h2422
vclo_w2422
vclz_b2422
vclz_d2422
vclz_h2422
vclz_w2422
vdiv_b29, 320.0629, 320.06
vdiv_bu29, 330.0629, 590.06
vdiv_d80.258, 18.50.11
vdiv_du80.258, 18.50.11
vdiv_h170.1217, 21.50.09
vdiv_hu17, 220.1117, 21.50.07
vdiv_w110.1811, 17.50.09
vdiv_wu110.1811, 17.50.07
vext2xv_d_b3432
vext2xv_d_h3432
vext2xv_d_w3432
vext2xv_du_bu3432
vext2xv_du_hu3432
vext2xv_du_wu3432
vext2xv_h_b3432
vext2xv_hu_bu3432
vext2xv_w_b3432
vext2xv_w_h3432
vext2xv_wu_bu3432
vext2xv_wu_hu3432
vexth_d_w1412
vexth_du_wu1412
vexth_h_b1412
vexth_hu_bu1412
vexth_q_d1412
vexth_qu_du1412
vexth_w_h1412
vexth_wu_hu1412
vextl_q_d1412
vextl_qu_du1412
vextrins_b1412
vextrins_d1412
vextrins_h1412
vextrins_w1412
vfadd_d3452
vfadd_s3452
vfclass_d2422
vfclass_s2422
vfcmp_caf_d2422
vfcmp_caf_s2422
vfcmp_ceq_d2422
vfcmp_ceq_s2422
vfcmp_cle_d2422
vfcmp_cle_s2422
vfcmp_clt_d2422
vfcmp_clt_s2422
vfcmp_cne_d2422
vfcmp_cne_s2422
vfcmp_cor_d2422
vfcmp_cor_s2422
vfcmp_cueq_d2422
vfcmp_cueq_s2422
vfcmp_cule_d2422
vfcmp_cule_s2422
vfcmp_cult_d2422
vfcmp_cult_s2422
vfcmp_cun_d2422
vfcmp_cun_s2422
vfcmp_cune_d2422
vfcmp_cune_s2422
vfcmp_saf_d2422
vfcmp_saf_s2422
vfcmp_seq_d2422
vfcmp_seq_s2422
vfcmp_sle_d2422
vfcmp_sle_s2422
vfcmp_slt_d2422
vfcmp_slt_s2422
vfcmp_sne_d2422
vfcmp_sne_s2422
vfcmp_sor_d2422
vfcmp_sor_s2422
vfcmp_sueq_d2422
vfcmp_sueq_s2422
vfcmp_sule_d2422
vfcmp_sule_s2422
vfcmp_sult_d2422
vfcmp_sult_s2422
vfcmp_sun_d2422
vfcmp_sun_s2422
vfcmp_sune_d2422
vfcmp_sune_s2422
vfcvt_h_s3231
vfcvt_s_d3231
vfcvth_d_s3231
vfcvth_s_h3231
vfcvtl_d_s3231
vfcvtl_s_h3231
vfdiv_d8, 21.470.258, 170.08
vfdiv_s110.1811, 19.50.13
vffint_d_l4442
vffint_d_lu4442
vffint_s_l5251
vffint_s_w4442
vffint_s_wu4442
vffinth_d_w5251
vffintl_d_w5251
vflogb_d4442
vflogb_s4442
vfmadd_d5252
vfmadd_s5252
vfmax_d2422
vfmax_s2422
vfmaxa_d2422
vfmaxa_s2422
vfmin_d2422
vfmin_s2422
vfmina_d2422
vfmina_s2422
vfmsub_d5252
vfmsub_s5252
vfmul_d5252
vfmul_s5252
vfnmadd_d5252
vfnmadd_s5252
vfnmsub_d5252
vfnmsub_s5252
vfrecip_d80.25230.08
vfrecip_s110.18270.14
vfrint_d4242
vfrint_s4242
vfrintrm_d4242
vfrintrm_s4242
vfrintrne_d4242
vfrintrne_s4242
vfrintrp_d4242
vfrintrp_s4242
vfrintrz_d4242
vfrintrz_s4242
vfrsqrt_d150.04150.04
vfrsqrt_s170.05250.11
vfrstp_b2222
vfrstp_h2222
vfrstpi_b2222
vfrstpi_h2222
vfsqrt_d360.06360.05
vfsqrt_s110.08150.17
vfsub_d3452
vfsub_s3452
vftint_l_d4442
vftint_lu_d4442
vftint_w_d5251
vftint_w_s4442
vftint_wu_s4442
vftinth_l_s5251
vftintl_l_s5251
vftintrm_l_d4442
vftintrm_w_d5251
vftintrm_w_s4442
vftintrmh_l_s5251
vftintrml_l_s5251
vftintrne_l_d4442
vftintrne_w_d5251
vftintrne_w_s4442
vftintrneh_l_s5251
vftintrnel_l_s5251
vftintrp_l_d4442
vftintrp_w_d5251
vftintrp_w_s4442
vftintrph_l_s5251
vftintrpl_l_s5251
vftintrz_l_d4442
vftintrz_lu_d4442
vftintrz_w_d5251
vftintrz_w_s4442
vftintrz_wu_s4442
vftintrzh_l_s5251
vftintrzl_l_s5251
vhaddw_d_w2222
vhaddw_du_wu2222
vhaddw_h_b2222
vhaddw_hu_bu2222
vhaddw_q_d3232
vhaddw_qu_du3232
vhaddw_w_h2222
vhaddw_wu_hu2222
vhsubw_d_w2222
vhsubw_du_wu2222
vhsubw_h_b2222
vhsubw_hu_bu2222
vhsubw_q_d3232
vhsubw_qu_du3232
vhsubw_w_h2222
vhsubw_wu_hu2222
vilvh_b1412
vilvh_d1412
vilvh_h1412
vilvh_w1412
vilvl_b1412
vilvl_d1412
vilvl_h1412
vilvl_w1412
vinsgr2vr_b1111
vinsgr2vr_d1111
vinsgr2vr_h1111
vinsgr2vr_w1111
vmadd_b4242
vmadd_d4242
vmadd_h4242
vmadd_w4242
vmaddwev_d_w4242
vmaddwev_d_wu4242
vmaddwev_d_wu_w4242
vmaddwev_h_b4242
vmaddwev_h_bu4242
vmaddwev_h_bu_b4242
vmaddwev_q_d71.1471.14
vmaddwev_q_du71.1471.14
vmaddwev_q_du_d71.1471.14
vmaddwev_w_h4242
vmaddwev_w_hu4242
vmaddwev_w_hu_h4242
vmaddwod_d_w4242
vmaddwod_d_wu4242
vmaddwod_d_wu_w4242
vmaddwod_h_b4242
vmaddwod_h_bu4242
vmaddwod_h_bu_b4242
vmaddwod_q_d71.1471.14
vmaddwod_q_du71.1471.14
vmaddwod_q_du_d71.1471.14
vmaddwod_w_h4242
vmaddwod_w_hu4242
vmaddwod_w_hu_h4242
vmax_b1412
vmax_bu1412
vmax_d2422
vmax_du2422
vmax_h1412
vmax_hu1412
vmax_w1412
vmax_wu1412
vmaxi_b1412
vmaxi_bu1412
vmaxi_d2422
vmaxi_du2422
vmaxi_h1412
vmaxi_hu1412
vmaxi_w1412
vmaxi_wu1412
vmin_b1412
vmin_bu1412
vmin_d2422
vmin_du2422
vmin_h1412
vmin_hu1412
vmin_w1412
vmin_wu1412
vmini_b1412
vmini_bu1412
vmini_d2422
vmini_du2422
vmini_h1412
vmini_hu1412
vmini_w1412
vmini_wu1412
vmod_b29, 350.0629, 330.06
vmod_bu29, 370.0629, 330.05
vmod_d8, 100.258, 100.11
vmod_du8, 100.258, 100.11
vmod_h17, 210.1217, 210.09
vmod_hu17, 210.1117, 210.07
vmod_w11, 130.1811, 150.08
vmod_wu11, 130.1811, 150.06
vmskgez_b1412
vmskltz_b1412
vmskltz_d1412
vmskltz_h1412
vmskltz_w1412
vmsknz_b1412
vmsub_b4242
vmsub_d4242
vmsub_h4242
vmsub_w4242
vmuh_b4242
vmuh_bu4242
vmuh_d4242
vmuh_du4242
vmuh_h4242
vmuh_hu4242
vmuh_w4242
vmuh_wu4242
vmul_b4242
vmul_d4242
vmul_h4242
vmul_w4242
vmulwev_d_w4242
vmulwev_d_wu4242
vmulwev_d_wu_w4242
vmulwev_h_b4242
vmulwev_h_bu4242
vmulwev_h_bu_b4242
vmulwev_q_d7272
vmulwev_q_du7272
vmulwev_q_du_d7272
vmulwev_w_h4242
vmulwev_w_hu4242
vmulwev_w_hu_h4242
vmulwod_d_w4242
vmulwod_d_wu4242
vmulwod_d_wu_w4242
vmulwod_h_b4242
vmulwod_h_bu4242
vmulwod_h_bu_b4242
vmulwod_q_d7272
vmulwod_q_du7272
vmulwod_q_du_d7272
vmulwod_w_h4242
vmulwod_w_hu4242
vmulwod_w_hu_h4242
vneg_b1412
vneg_d1412
vneg_h1412
vneg_w1412
vnor_v1412
vnori_b1412
vor_v1412
vori_b1412
vorn_v1412
vpackev_b1412
vpackev_d1412
vpackev_h1412
vpackev_w1412
vpackod_b1412
vpackod_d1412
vpackod_h1412
vpackod_w1412
vpcnt_b2222
vpcnt_d2222
vpcnt_h2222
vpcnt_w2222
vpermi_w1412
vpickev_b1412
vpickev_d1412
vpickev_h1412
vpickev_w1412
vpickod_b1412
vpickod_d1412
vpickod_h1412
vpickod_w1412
vpickve2gr_b1111
vpickve2gr_bu1111
vpickve2gr_d1111
vpickve2gr_du1111
vpickve2gr_h1111
vpickve2gr_hu1111
vpickve2gr_w1111
vpickve2gr_wu1111
vreplgr2vr_bN/A1N/A1
vreplgr2vr_dN/A1N/A1
vreplgr2vr_hN/A1N/A1
vreplgr2vr_wN/A1N/A1
vrepli_bN/A6N/A2
vrepli_dN/A4N/A2
vrepli_hN/A4N/A2
vrepli_wN/A4N/A2
vreplve_b1111
vreplve_d1111
vreplve_h1111
vreplve_w1111
vreplvei_b1412
vreplvei_d1412
vreplvei_h1412
vreplvei_w1412
vrotr_b1422
vrotr_d1422
vrotr_h1422
vrotr_w1422
vrotri_b1422
vrotri_d1422
vrotri_h1422
vrotri_w1422
vsadd_b1412
vsadd_bu1412
vsadd_d1412
vsadd_du1412
vsadd_h1412
vsadd_hu1412
vsadd_w1412
vsadd_wu1412
vsat_b2222
vsat_bu2222
vsat_d2222
vsat_du2222
vsat_h2222
vsat_hu2222
vsat_w2222
vsat_wu2222
vseq_b1412
vseq_d1412
vseq_h1412
vseq_w1412
vseqi_b1412
vseqi_d1412
vseqi_h1412
vseqi_w1412
vsetallnez_b0.520.52
vsetallnez_d0.520.52
vsetallnez_h0.520.52
vsetallnez_w0.520.52
vsetanyeqz_b0.520.52
vsetanyeqz_d0.520.52
vsetanyeqz_h0.520.52
vsetanyeqz_w0.520.52
vseteqz_v0.520.52
vsetnez_v0.520.52
vshuf4i_b1412
vshuf4i_d1412
vshuf4i_h1412
vshuf4i_w1412
vshuf_b1212
vshuf_d1212
vshuf_h1212
vshuf_w1212
vsigncov_b1212
vsigncov_d1212
vsigncov_h1212
vsigncov_w1212
vsle_b1412
vsle_bu1412
vsle_d2422
vsle_du2422
vsle_h1412
vsle_hu1412
vsle_w1412
vsle_wu1412
vslei_b1412
vslei_bu1412
vslei_d2422
vslei_du2422
vslei_h1412
vslei_hu1412
vslei_w1412
vslei_wu1412
vsll_b1412
vsll_d1412
vsll_h1412
vsll_w1412
vslli_b1412
vslli_d1412
vslli_h1412
vslli_w1412
vsllwil_d_w2221
vsllwil_du_wu2221
vsllwil_h_b2221
vsllwil_hu_bu2221
vsllwil_w_h2221
vsllwil_wu_hu2221
vslt_b1412
vslt_bu1412
vslt_d2422
vslt_du2422
vslt_h1412
vslt_hu1412
vslt_w1412
vslt_wu1412
vslti_b1412
vslti_bu1412
vslti_d2422
vslti_du2422
vslti_h1412
vslti_hu1412
vslti_w1412
vslti_wu1412
vsra_b1412
vsra_d1412
vsra_h1412
vsra_w1412
vsrai_b1412
vsrai_d1412
vsrai_h1412
vsrai_w1412
vsran_b_h2221
vsran_h_w2221
vsran_w_d2221
vsrani_b_h4241
vsrani_d_q3232
vsrani_h_w4241
vsrani_w_d4241
vsrar_b3232
vsrar_d3232
vsrar_h3232
vsrar_w3232
vsrari_b3232
vsrari_d3232
vsrari_h3232
vsrari_w3232
vsrarn_b_h4241
vsrarn_h_w4241
vsrarn_w_d4241
vsrarni_b_h4241
vsrarni_d_q3232
vsrarni_h_w4241
vsrarni_w_d4241
vsrl_b1412
vsrl_d1412
vsrl_h1412
vsrl_w1412
vsrli_b1412
vsrli_d1412
vsrli_h1412
vsrli_w1412
vsrln_b_h2221
vsrln_h_w2221
vsrln_w_d2221
vsrlni_b_h4241
vsrlni_d_q3232
vsrlni_h_w4241
vsrlni_w_d4241
vsrlr_b3232
vsrlr_d3232
vsrlr_h3232
vsrlr_w3232
vsrlri_b3232
vsrlri_d3232
vsrlri_h3232
vsrlri_w3232
vsrlrn_b_h4241
vsrlrn_h_w4241
vsrlrn_w_d4241
vsrlrni_b_h4241
vsrlrni_d_q3232
vsrlrni_h_w4241
vsrlrni_w_d4241
vssran_b_h4241
vssran_bu_h4241
vssran_h_w4241
vssran_hu_w4241
vssran_w_d4241
vssran_wu_d4241
vssrani_b_h4241
vssrani_bu_h4241
vssrani_d_q3232
vssrani_du_q3232
vssrani_h_w4241
vssrani_hu_w4241
vssrani_w_d4241
vssrani_wu_d4241
vssrarn_b_h4241
vssrarn_bu_h4241
vssrarn_h_w4241
vssrarn_hu_w4241
vssrarn_w_d4241
vssrarn_wu_d4241
vssrarni_b_h4241
vssrarni_bu_h4241
vssrarni_d_q3232
vssrarni_du_q3232
vssrarni_h_w4241
vssrarni_hu_w4241
vssrarni_w_d4241
vssrarni_wu_d4241
vssrln_b_h4241
vssrln_bu_h4241
vssrln_h_w4241
vssrln_hu_w4241
vssrln_w_d4241
vssrln_wu_d4241
vssrlni_b_h4241
vssrlni_bu_h4241
vssrlni_d_q3232
vssrlni_du_q3232
vssrlni_h_w4241
vssrlni_hu_w4241
vssrlni_w_d4241
vssrlni_wu_d4241
vssrlrn_b_h4241
vssrlrn_bu_h4241
vssrlrn_h_w4241
vssrlrn_hu_w4241
vssrlrn_w_d4241
vssrlrn_wu_d4241
vssrlrni_b_h4241
vssrlrni_bu_h4241
vssrlrni_d_q3232
vssrlrni_du_q3232
vssrlrni_h_w4241
vssrlrni_hu_w4241
vssrlrni_w_d4241
vssrlrni_wu_d4241
vssub_b1412
vssub_bu1412
vssub_d1412
vssub_du1412
vssub_h1412
vssub_hu1412
vssub_w1412
vssub_wu1412
vsub_b1412
vsub_d1412
vsub_h1412
vsub_q3232
vsub_w1412
vsubi_bu1412
vsubi_du1412
vsubi_hu1412
vsubi_wu1412
vsubwev_d_w2222
vsubwev_d_wu2222
vsubwev_h_b2222
vsubwev_h_bu2222
vsubwev_q_d3232
vsubwev_q_du3232
vsubwev_w_h2222
vsubwev_w_hu2222
vsubwod_d_w2222
vsubwod_d_wu2222
vsubwod_h_b2222
vsubwod_h_bu2222
vsubwod_q_d3232
vsubwod_q_du3232
vsubwod_w_h2222
vsubwod_w_hu2222
vxor_v1412
vxori_b1412
xvabsd_b2222
xvabsd_bu2222
xvabsd_d2222
xvabsd_du2222
xvabsd_h2222
xvabsd_hu2222
xvabsd_w2222
xvabsd_wu2222
xvadd_b1412
xvadd_d1412
xvadd_h1412
xvadd_q3232
xvadd_w1412
xvadda_b3232
xvadda_d3232
xvadda_h3232
xvadda_w3232
xvaddi_bu1412
xvaddi_du1412
xvaddi_hu1412
xvaddi_wu1412
xvaddwev_d_w2222
xvaddwev_d_wu2222
xvaddwev_d_wu_w2222
xvaddwev_h_b2222
xvaddwev_h_bu2222
xvaddwev_h_bu_b2222
xvaddwev_q_d3232
xvaddwev_q_du3232
xvaddwev_q_du_d3232
xvaddwev_w_h2222
xvaddwev_w_hu2222
xvaddwev_w_hu_h2222
xvaddwod_d_w2222
xvaddwod_d_wu2222
xvaddwod_d_wu_w2222
xvaddwod_h_b2222
xvaddwod_h_bu2222
xvaddwod_h_bu_b2222
xvaddwod_q_d3232
xvaddwod_q_du3232
xvaddwod_q_du_d3232
xvaddwod_w_h2222
xvaddwod_w_hu2222
xvaddwod_w_hu_h2222
xvand_v1412
xvandi_b1412
xvandn_v1412
xvavg_b1412
xvavg_bu1412
xvavg_d2422
xvavg_du2422
xvavg_h1412
xvavg_hu1412
xvavg_w1412
xvavg_wu1412
xvavgr_b1412
xvavgr_bu1412
xvavgr_d2422
xvavgr_du2422
xvavgr_h1412
xvavgr_hu1412
xvavgr_w1412
xvavgr_wu1412
xvbitclr_b2222
xvbitclr_d2222
xvbitclr_h2222
xvbitclr_w2222
xvbitclri_b2222
xvbitclri_d2222
xvbitclri_h2222
xvbitclri_w2222
xvbitrev_b2222
xvbitrev_d2222
xvbitrev_h2222
xvbitrev_w2222
xvbitrevi_b2222
xvbitrevi_d2222
xvbitrevi_h2222
xvbitrevi_w2222
xvbitsel_v1212
xvbitseli_b1212
xvbitset_b2222
xvbitset_d2222
xvbitset_h2222
xvbitset_w2222
xvbitseti_b2222
xvbitseti_d2222
xvbitseti_h2222
xvbitseti_w2222
xvbsll_v1412
xvbsrl_v1412
xvclo_b2422
xvclo_d2422
xvclo_h2422
xvclo_w2422
xvclz_b2422
xvclz_d2422
xvclz_h2422
xvclz_w2422
xvdiv_b29, 320.0632, 360.05
xvdiv_bu29, 330.0629, 590.05
xvdiv_d80.258, 18.50.11
xvdiv_du80.258, 18.50.11
xvdiv_h170.1221.5, 220.08
xvdiv_hu17, 220.1117, 21.50.07
xvdiv_w110.1811, 17.50.09
xvdiv_wu110.1811, 17.50.07
xvexth_d_w1412
xvexth_du_wu1412
xvexth_h_b1412
xvexth_hu_bu1412
xvexth_q_d1412
xvexth_qu_du1412
xvexth_w_h1412
xvexth_wu_hu1412
xvextl_q_d1412
xvextl_qu_du1412
xvextrins_b1412
xvextrins_d1412
xvextrins_h1412
xvextrins_w1412
xvfadd_d3452
xvfadd_s3452
xvfclass_d2422
xvfclass_s2422
xvfcmp_caf_d2422
xvfcmp_caf_s2422
xvfcmp_ceq_d2422
xvfcmp_ceq_s2422
xvfcmp_cle_d2422
xvfcmp_cle_s2422
xvfcmp_clt_d2422
xvfcmp_clt_s2422
xvfcmp_cne_d2422
xvfcmp_cne_s2422
xvfcmp_cor_d2422
xvfcmp_cor_s2422
xvfcmp_cueq_d2422
xvfcmp_cueq_s2422
xvfcmp_cule_d2422
xvfcmp_cule_s2422
xvfcmp_cult_d2422
xvfcmp_cult_s2422
xvfcmp_cun_d2422
xvfcmp_cun_s2422
xvfcmp_cune_d2422
xvfcmp_cune_s2422
xvfcmp_saf_d2422
xvfcmp_saf_s2422
xvfcmp_seq_d2422
xvfcmp_seq_s2422
xvfcmp_sle_d2422
xvfcmp_sle_s2422
xvfcmp_slt_d2422
xvfcmp_slt_s2422
xvfcmp_sne_d2422
xvfcmp_sne_s2422
xvfcmp_sor_d2422
xvfcmp_sor_s2422
xvfcmp_sueq_d2422
xvfcmp_sueq_s2422
xvfcmp_sule_d2422
xvfcmp_sule_s2422
xvfcmp_sult_d2422
xvfcmp_sult_s2422
xvfcmp_sun_d2422
xvfcmp_sun_s2422
xvfcmp_sune_d2422
xvfcmp_sune_s2422
xvfcvt_h_s3231
xvfcvt_s_d3231
xvfcvth_d_s3231
xvfcvth_s_h3231
xvfcvtl_d_s3231
xvfcvtl_s_h3231
xvfdiv_d8, 21.470.258, 16.50.08
xvfdiv_s110.1811, 19.50.1
xvffint_d_l4442
xvffint_d_lu4442
xvffint_s_l5251
xvffint_s_w4442
xvffint_s_wu4442
xvffinth_d_w5251
xvffintl_d_w5251
xvflogb_d4442
xvflogb_s4442
xvfmadd_d5252
xvfmadd_s5252
xvfmax_d2422
xvfmax_s2422
xvfmaxa_d2422
xvfmaxa_s2422
xvfmin_d2422
xvfmin_s2422
xvfmina_d2422
xvfmina_s2422
xvfmsub_d5252
xvfmsub_s5252
xvfmul_d5252
xvfmul_s5252
xvfnmadd_d5252
xvfnmadd_s5252
xvfnmsub_d5252
xvfnmsub_s5252
xvfrecip_d230.25230.08
xvfrecip_s270.18270.14
xvfrint_d4242
xvfrint_s4242
xvfrintrm_d4242
xvfrintrm_s4242
xvfrintrne_d4242
xvfrintrne_s4242
xvfrintrp_d4242
xvfrintrp_s4242
xvfrintrz_d4242
xvfrintrz_s4242
xvfrsqrt_d150.04150.04
xvfrsqrt_s250.05250.05
xvfrstp_b2222
xvfrstp_h2222
xvfrstpi_b2222
xvfrstpi_h2222
xvfsqrt_d360.06360.05
xvfsqrt_s150.08270.07
xvfsub_d3452
xvfsub_s3452
xvftint_l_d4442
xvftint_lu_d4442
xvftint_w_d5251
xvftint_w_s4442
xvftint_wu_s4442
xvftinth_l_s5251
xvftintl_l_s5251
xvftintrm_l_d4442
xvftintrm_w_d5251
xvftintrm_w_s4442
xvftintrmh_l_s5251
xvftintrml_l_s5251
xvftintrne_l_d4442
xvftintrne_w_d5251
xvftintrne_w_s4442
xvftintrneh_l_s5251
xvftintrnel_l_s5251
xvftintrp_l_d4442
xvftintrp_w_d5251
xvftintrp_w_s4442
xvftintrph_l_s5251
xvftintrpl_l_s5251
xvftintrz_l_d4442
xvftintrz_lu_d4442
xvftintrz_w_d5251
xvftintrz_w_s4442
xvftintrz_wu_s4442
xvftintrzh_l_s5251
xvftintrzl_l_s5251
xvhaddw_d_w2222
xvhaddw_du_wu2222
xvhaddw_h_b2222
xvhaddw_hu_bu2222
xvhaddw_q_d3232
xvhaddw_qu_du3232
xvhaddw_w_h2222
xvhaddw_wu_hu2222
xvhseli_d1111
xvhsubw_d_w2222
xvhsubw_du_wu2222
xvhsubw_h_b2222
xvhsubw_hu_bu2222
xvhsubw_q_d3232
xvhsubw_qu_du3232
xvhsubw_w_h2222
xvhsubw_wu_hu2222
xvilvh_b1412
xvilvh_d1412
xvilvh_h1412
xvilvh_w1412
xvilvl_b1412
xvilvl_d1412
xvilvl_h1412
xvilvl_w1412
xvinsgr2vr_d1111
xvinsgr2vr_w1111
xvinsve0_d1412
xvinsve0_w1412
xvmadd_b4242
xvmadd_d4242
xvmadd_h4242
xvmadd_w4242
xvmaddwev_d_w4242
xvmaddwev_d_wu4242
xvmaddwev_d_wu_w4242
xvmaddwev_h_b4242
xvmaddwev_h_bu4242
xvmaddwev_h_bu_b4242
xvmaddwev_q_d71.1471.14
xvmaddwev_q_du71.1471.14
xvmaddwev_q_du_d71.1471.14
xvmaddwev_w_h4242
xvmaddwev_w_hu4242
xvmaddwev_w_hu_h4242
xvmaddwod_d_w4242
xvmaddwod_d_wu4242
xvmaddwod_d_wu_w4242
xvmaddwod_h_b4242
xvmaddwod_h_bu4242
xvmaddwod_h_bu_b4242
xvmaddwod_q_d71.1471.14
xvmaddwod_q_du71.1471.14
xvmaddwod_q_du_d71.1471.14
xvmaddwod_w_h4242
xvmaddwod_w_hu4242
xvmaddwod_w_hu_h4242
xvmax_b1412
xvmax_bu1412
xvmax_d2422
xvmax_du2422
xvmax_h1412
xvmax_hu1412
xvmax_w1412
xvmax_wu1412
xvmaxi_b1412
xvmaxi_bu1412
xvmaxi_d2422
xvmaxi_du2422
xvmaxi_h1412
xvmaxi_hu1412
xvmaxi_w1412
xvmaxi_wu1412
xvmin_b1412
xvmin_bu1412
xvmin_d2422
xvmin_du2422
xvmin_h1412
xvmin_hu1412
xvmin_w1412
xvmin_wu1412
xvmini_b1412
xvmini_bu1412
xvmini_d2422
xvmini_du2422
xvmini_h1412
xvmini_hu1412
xvmini_w1412
xvmini_wu1412
xvmod_b29, 410.0629, 330.05
xvmod_bu29, 370.0629, 370.05
xvmod_d8, 100.258, 100.11
xvmod_du8, 100.258, 100.11
xvmod_h17, 210.1217, 210.07
xvmod_hu17, 250.1117, 230.06
xvmod_w11, 130.1811, 150.08
xvmod_wu11, 130.1811, 150.06
xvmskgez_b1412
xvmskltz_b1412
xvmskltz_d1412
xvmskltz_h1412
xvmskltz_w1412
xvmsknz_b1412
xvmsub_b4242
xvmsub_d4242
xvmsub_h4242
xvmsub_w4242
xvmuh_b4242
xvmuh_bu4242
xvmuh_d4242
xvmuh_du4242
xvmuh_h4242
xvmuh_hu4242
xvmuh_w4242
xvmuh_wu4242
xvmul_b4242
xvmul_d4242
xvmul_h4242
xvmul_w4242
xvmulwev_d_w4242
xvmulwev_d_wu4242
xvmulwev_d_wu_w4242
xvmulwev_h_b4242
xvmulwev_h_bu4242
xvmulwev_h_bu_b4242
xvmulwev_q_d7272
xvmulwev_q_du7272
xvmulwev_q_du_d7272
xvmulwev_w_h4242
xvmulwev_w_hu4242
xvmulwev_w_hu_h4242
xvmulwod_d_w4242
xvmulwod_d_wu4242
xvmulwod_d_wu_w4242
xvmulwod_h_b4242
xvmulwod_h_bu4242
xvmulwod_h_bu_b4242
xvmulwod_q_d7272
xvmulwod_q_du7272
xvmulwod_q_du_d7272
xvmulwod_w_h4242
xvmulwod_w_hu4242
xvmulwod_w_hu_h4242
xvneg_b1412
xvneg_d1412
xvneg_h1412
xvneg_w1412
xvnor_v1412
xvnori_b1412
xvor_v1412
xvori_b1412
xvorn_v1412
xvpackev_b1412
xvpackev_d1412
xvpackev_h1412
xvpackev_w1412
xvpackod_b1412
xvpackod_d1412
xvpackod_h1412
xvpackod_w1412
xvpcnt_b2222
xvpcnt_d2222
xvpcnt_h2222
xvpcnt_w2222
xvperm_w3432
xvpermi_d3432
xvpermi_q32.6732
xvpermi_w1412
xvpickev_b1412
xvpickev_d1412
xvpickev_h1412
xvpickev_w1412
xvpickod_b1412
xvpickod_d1412
xvpickod_h1412
xvpickod_w1412
xvpickve2gr_d1111
xvpickve2gr_du1111
xvpickve2gr_w1111
xvpickve2gr_wu1111
xvpickve_d3432
xvpickve_w3432
xvrepl128vei_b1412
xvrepl128vei_d1412
xvrepl128vei_h1412
xvrepl128vei_w1412
xvreplgr2vr_bN/A1N/A1
xvreplgr2vr_dN/A1N/A1
xvreplgr2vr_hN/A1N/A1
xvreplgr2vr_wN/A1N/A1
xvrepli_bN/A6N/A2
xvrepli_dN/A4N/A2
xvrepli_hN/A4N/A2
xvrepli_wN/A4N/A2
xvreplve0_b3432
xvreplve0_d3432
xvreplve0_h3432
xvreplve0_q3432
xvreplve0_w3432
xvreplve_b1111
xvreplve_d1111
xvreplve_h1111
xvreplve_w1111
xvrotr_b1422
xvrotr_d1422
xvrotr_h1422
xvrotr_w1422
xvrotri_b1422
xvrotri_d1422
xvrotri_h1422
xvrotri_w1422
xvsadd_b1412
xvsadd_bu1412
xvsadd_d1412
xvsadd_du1412
xvsadd_h1412
xvsadd_hu1412
xvsadd_w1412
xvsadd_wu1412
xvsat_b2222
xvsat_bu2222
xvsat_d2222
xvsat_du2222
xvsat_h2222
xvsat_hu2222
xvsat_w2222
xvsat_wu2222
xvseq_b1412
xvseq_d1412
xvseq_h1412
xvseq_w1412
xvseqi_b1412
xvseqi_d1412
xvseqi_h1412
xvseqi_w1412
xvsetallnez_b0.520.52
xvsetallnez_d0.520.52
xvsetallnez_h0.520.52
xvsetallnez_w0.520.52
xvsetanyeqz_b0.520.52
xvsetanyeqz_d0.520.52
xvsetanyeqz_h0.520.52
xvsetanyeqz_w0.520.52
xvseteqz_v0.520.52
xvsetnez_v0.520.52
xvshuf4i_b1412
xvshuf4i_d1412
xvshuf4i_h1412
xvshuf4i_w1412
xvshuf_b1212
xvshuf_d1212
xvshuf_h1212
xvshuf_w1212
xvsigncov_b1212
xvsigncov_d1212
xvsigncov_h1212
xvsigncov_w1212
xvsle_b1412
xvsle_bu1412
xvsle_d2422
xvsle_du2422
xvsle_h1412
xvsle_hu1412
xvsle_w1412
xvsle_wu1412
xvslei_b1412
xvslei_bu1412
xvslei_d2422
xvslei_du2422
xvslei_h1412
xvslei_hu1412
xvslei_w1412
xvslei_wu1412
xvsll_b1412
xvsll_d1412
xvsll_h1412
xvsll_w1412
xvslli_b1412
xvslli_d1412
xvslli_h1412
xvslli_w1412
xvsllwil_d_w2221
xvsllwil_du_wu2221
xvsllwil_h_b2221
xvsllwil_hu_bu2221
xvsllwil_w_h2221
xvsllwil_wu_hu2221
xvslt_b1412
xvslt_bu1412
xvslt_d2422
xvslt_du2422
xvslt_h1412
xvslt_hu1412
xvslt_w1412
xvslt_wu1412
xvslti_b1412
xvslti_bu1412
xvslti_d2422
xvslti_du2422
xvslti_h1412
xvslti_hu1412
xvslti_w1412
xvslti_wu1412
xvsra_b1412
xvsra_d1412
xvsra_h1412
xvsra_w1412
xvsrai_b1412
xvsrai_d1412
xvsrai_h1412
xvsrai_w1412
xvsran_b_h2221
xvsran_h_w2221
xvsran_w_d2221
xvsrani_b_h4241
xvsrani_d_q3232
xvsrani_h_w4241
xvsrani_w_d4241
xvsrar_b3232
xvsrar_d3232
xvsrar_h3232
xvsrar_w3232
xvsrari_b3232
xvsrari_d3232
xvsrari_h3232
xvsrari_w3232
xvsrarn_b_h4241
xvsrarn_h_w4241
xvsrarn_w_d4241
xvsrarni_b_h4241
xvsrarni_d_q3232
xvsrarni_h_w4241
xvsrarni_w_d4241
xvsrl_b1412
xvsrl_d1412
xvsrl_h1412
xvsrl_w1412
xvsrli_b1412
xvsrli_d1412
xvsrli_h1412
xvsrli_w1412
xvsrln_b_h2221
xvsrln_h_w2221
xvsrln_w_d2221
xvsrlni_b_h4241
xvsrlni_d_q3232
xvsrlni_h_w4241
xvsrlni_w_d4241
xvsrlr_b3232
xvsrlr_d3232
xvsrlr_h3232
xvsrlr_w3232
xvsrlri_b3232
xvsrlri_d3232
xvsrlri_h3232
xvsrlri_w3232
xvsrlrn_b_h4241
xvsrlrn_h_w4241
xvsrlrn_w_d4241
xvsrlrni_b_h4241
xvsrlrni_d_q3232
xvsrlrni_h_w4241
xvsrlrni_w_d4241
xvssran_b_h4241
xvssran_bu_h4241
xvssran_h_w4241
xvssran_hu_w4241
xvssran_w_d4241
xvssran_wu_d4241
xvssrani_b_h4241
xvssrani_bu_h4241
xvssrani_d_q3232
xvssrani_du_q3232
xvssrani_h_w4241
xvssrani_hu_w4241
xvssrani_w_d4241
xvssrani_wu_d4241
xvssrarn_b_h4241
xvssrarn_bu_h4241
xvssrarn_h_w4241
xvssrarn_hu_w4241
xvssrarn_w_d4241
xvssrarn_wu_d4241
xvssrarni_b_h4241
xvssrarni_bu_h4241
xvssrarni_d_q3232
xvssrarni_du_q3232
xvssrarni_h_w4241
xvssrarni_hu_w4241
xvssrarni_w_d4241
xvssrarni_wu_d4241
xvssrln_b_h4241
xvssrln_bu_h4241
xvssrln_h_w4241
xvssrln_hu_w4241
xvssrln_w_d4241
xvssrln_wu_d4241
xvssrlni_b_h4241
xvssrlni_bu_h4241
xvssrlni_d_q3232
xvssrlni_du_q3232
xvssrlni_h_w4241
xvssrlni_hu_w4241
xvssrlni_w_d4241
xvssrlni_wu_d4241
xvssrlrn_b_h4241
xvssrlrn_bu_h4241
xvssrlrn_h_w4241
xvssrlrn_hu_w4241
xvssrlrn_w_d4241
xvssrlrn_wu_d4241
xvssrlrni_b_h4241
xvssrlrni_bu_h4241
xvssrlrni_d_q3232
xvssrlrni_du_q3232
xvssrlrni_h_w4241
xvssrlrni_hu_w4241
xvssrlrni_w_d4241
xvssrlrni_wu_d4241
xvssub_b1412
xvssub_bu1412
xvssub_d1412
xvssub_du1412
xvssub_h1412
xvssub_hu1412
xvssub_w1412
xvssub_wu1412
xvsub_b1412
xvsub_d1412
xvsub_h1412
xvsub_q3232
xvsub_w1412
xvsubi_bu1412
xvsubi_du1412
xvsubi_hu1412
xvsubi_wu1412
xvsubwev_d_w2222
xvsubwev_d_wu2222
xvsubwev_h_b2222
xvsubwev_h_bu2222
xvsubwev_q_d3232
xvsubwev_q_du3232
xvsubwev_w_h2222
xvsubwev_w_hu2222
xvsubwod_d_w2222
xvsubwod_d_wu2222
xvsubwod_h_b2222
xvsubwod_h_bu2222
xvsubwod_q_d3232
xvsubwod_q_du3232
xvsubwod_w_h2222
xvsubwod_w_hu2222
xvxor_v1412
xvxori_b1412
+ + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vbitseli.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise selection: for each bit position, if the bit in a is one, copy the corresponding bit from imm to dst; otherwise, copy the bit from b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vbitclr_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitclr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 8-bit element of a, clear the bit whose index is given by the corresponding element of b (taken modulo 8), and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitclr_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitclr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 16-bit element of a, clear the bit whose index is given by the corresponding element of b (taken modulo 16), and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitclr_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitclr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 32-bit element of a, clear the bit whose index is given by the corresponding element of b (taken modulo 32), and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitclr_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitclr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitclr.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 64-bit element of a, clear the bit whose index is given by the corresponding element of b (taken modulo 64), and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clear the bit at index imm in each 8-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clear the bit at index imm in each 16-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] & (~((u16)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clear the bit at index imm in each 32-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] & (~((u32)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitclri.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clear the bit at index imm in each 64-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitset_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitset_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 8-bit element of a, set the bit whose index is given by the corresponding element of b (taken modulo 8), and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitset_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitset_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 16-bit element of a, set the bit whose index is given by the corresponding element of b (taken modulo 16), and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitset_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitset_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 32-bit element of a, set the bit whose index is given by the corresponding element of b (taken modulo 32), and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitset_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitset_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitset.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 64-bit element of a, set the bit whose index is given by the corresponding element of b (taken modulo 64), and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Set the bit at index imm in each 8-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Set the bit at index imm in each 16-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Set the bit at index imm in each 32-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitseti.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Set the bit at index imm in each 64-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitrev_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitrev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 8-bit element of a, toggle the bit whose index is given by the corresponding element of b (taken modulo 8), and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitrev_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitrev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 16-bit element of a, toggle the bit whose index is given by the corresponding element of b (taken modulo 16), and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitrev_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitrev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 32-bit element of a, toggle the bit whose index is given by the corresponding element of b (taken modulo 32), and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitrev_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vbitrev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vbitrev.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

In each 64-bit element of a, toggle the bit whose index is given by the corresponding element of b (taken modulo 64), and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Toggle the bit at index imm in each 8-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Toggle the bit at index imm in each 16-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Toggle the bit at index imm in each 32-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vbitrevi.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Toggle the bit at index imm in each 64-bit element of a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vclo_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vclo_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading ones of 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = clo(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclo_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vclo_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading ones of 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = clo(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclo_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vclo_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading ones of 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clo(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclo_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vclo_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclo.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading ones of 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clo(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclz_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vclz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading zeros of 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = clz(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclz_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vclz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading zeros of 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = clz(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclz_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vclz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading zeros of 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clz(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vclz_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vclz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vclz.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Count leading zeros of 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clz(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vexth_h_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_h_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.h.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 8-bit elements in the higher half of a to 16-bit.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vexth_hu_bu (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_hu_bu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.hu.bu vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 8-bit elements in the higher half of a to 16-bit.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vexth_w_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_w_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.w.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 16-bit elements in the higher half of a to 32-bit.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vexth_wu_hu (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_wu_hu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.wu.hu vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 16-bit elements in the higher half of a to 32-bit.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vexth_d_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.d.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 32-bit elements in the higher half of a to 64-bit.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vexth_du_wu (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_du_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.du.wu vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 32-bit elements in the higher half of a to 64-bit.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vexth_q_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.q.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 64-bit elements in the higher half of a to 128-bit.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vexth_qu_du (__m128i a)

+

Synopsis

+
__m128i __lsx_vexth_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vexth.qu.du vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 64-bit elements in the higher half of a to 128-bit.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vextl_q_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vextl_q_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.q.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend signed 64-bit elements in the lower half of a to 128-bit.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vextl_qu_du (__m128i a)

+

Synopsis

+
__m128i __lsx_vextl_qu_du (__m128i a)
+#include <lsxintrin.h>
+Instruction: vextl.qu.du vr, vr
+CPU Flags: LSX
+
+

Description

+

Extend unsigned 64-bit elements in the lower half of a to 128-bit.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

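Extract one 8-bit element from b and insert it into a: bits [3:0] of imm select the source element in b, and bits [7:4] select the destination position in a. All other elements are copied from a.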

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

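Extract one 16-bit element from b and insert it into a: bits [2:0] of imm select the source element in b, and bits [6:4] select the destination position in a. All other elements are copied from a.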

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

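Extract one 32-bit element from b and insert it into a: bits [1:0] of imm select the source element in b, and bits [5:4] select the destination position in a. All other elements are copied from a.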

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vextrins.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

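Extract one 64-bit element from b and insert it into a: bit 0 of imm selects the source element in b, and bit 4 selects the destination position in a. The other element is copied from a.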

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpcnt_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vpcnt_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Count the number of ones (population, popcount) in 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = popcount(a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vpcnt_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vpcnt_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Count the number of ones (population, popcount) in 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = popcount(a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vpcnt_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vpcnt_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Count the number of ones (population, popcount) in 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = popcount(a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vpcnt_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vpcnt_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vpcnt.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Count the number of ones (population, popcount) in 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = popcount(a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/branch/index.html b/lsx/branch/index.html new file mode 100644 index 0000000..9fe2ebf --- /dev/null +++ b/lsx/branch/index.html @@ -0,0 +1,701 @@ + + + + + + + + Branch - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Branch

+

int __lsx_bz_v (__m128i a)

+

Synopsis

+
int __lsx_bz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vseteqz.v fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if the whole vector a is equal to zero.

+

Operation

+
dst = a.qword[0] == 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bnz_v (__m128i a)

+

Synopsis

+
int __lsx_bnz_v (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetnez.v fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if the whole vector a is non-zero.

+

Operation

+
dst = a.qword[0] != 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bz_b (__m128i a)

+

Synopsis

+
int __lsx_bz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.b fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if any 8-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 16; i++) {
+  if (a.byte[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bz_h (__m128i a)

+

Synopsis

+
int __lsx_bz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.h fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if any 16-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 8; i++) {
+  if (a.half[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bz_w (__m128i a)

+

Synopsis

+
int __lsx_bz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.w fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if any 32-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 4; i++) {
+  if (a.word[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bz_d (__m128i a)

+

Synopsis

+
int __lsx_bz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetanyeqz.d fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if any 64-bit element in a is equal to zero.

+

Operation

+
dst = 0;
+for (int i = 0; i < 2; i++) {
+  if (a.dword[i] == 0) {
+    dst = 1;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bnz_b (__m128i a)

+

Synopsis

+
int __lsx_bnz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.b fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if all 8-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 16; i++) {
+  if (a.byte[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bnz_h (__m128i a)

+

Synopsis

+
int __lsx_bnz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.h fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if all 16-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 8; i++) {
+  if (a.half[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bnz_w (__m128i a)

+

Synopsis

+
int __lsx_bnz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.w fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if all 32-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 4; i++) {
+  if (a.word[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+

int __lsx_bnz_d (__m128i a)

+

Synopsis

+
int __lsx_bnz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vsetallnez.d fcc, vr; bcnez
+CPU Flags: LSX
+
+

Description

+

Expected to be used in branches: branch if all 64-bit elements in a are non-zero.

+

Operation

+
dst = 1;
+for (int i = 0; i < 2; i++) {
+  if (a.dword[i] == 0) {
+    dst = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60000.52
3C50000.52
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/float_comparison/index.html b/lsx/float_comparison/index.html new file mode 100644 index 0000000..832833c --- /dev/null +++ b/lsx/float_comparison/index.html @@ -0,0 +1,2435 @@ + + + + + + + + Floating Point Comparison - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Comparison

+

__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.caf.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.ceq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cle.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.clt.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cne.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cor.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cueq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cule.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cult.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cun.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.cune.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Do not trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.saf.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if AF(Always False), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.seq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sle.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.slt.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if LT(Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sne.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sor.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sueq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sule.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sult.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sun.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare single precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcmp.sune.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare double precision elements in a and b, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into dst. Trap for QNaN.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/float_computation/index.html b/lsx/float_computation/index.html new file mode 100644 index 0000000..311a1f8 --- /dev/null +++ b/lsx/float_computation/index.html @@ -0,0 +1,1443 @@ + + + + + + + + Floating Point Computation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Computation

+

__m128 __lsx_vfadd_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfadd_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfadd.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add single precision floating point elements in a to elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m128d __lsx_vfadd_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfadd_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfadd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add double precision floating point elements in a to elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m128 __lsx_vfdiv_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfdiv.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide single precision floating point elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.18
3C500011, 19.50.13
+

__m128d __lsx_vfdiv_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfdiv.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide double precision floating point elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A60008, 21.470.25
3C50008, 170.08
+

__m128 __lsx_vfmax_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmax_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmax.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute maximum of single precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128d __lsx_vfmax_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmax_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmax.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute maximum of double precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute maximum of single precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmaxa.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute maximum of double precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfmin_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmin_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmin.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute minimum of single precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128d __lsx_vfmin_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmin_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmin.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute minimum of double precision floating point elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfmina_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmina_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmina.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute minimum of single precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128d __lsx_vfmina_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmina_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmina.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute minimum of double precision floating point elements in a and b by magnitude.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfmul_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfmul_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfmul.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply single precision floating point elements in a and elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128d __lsx_vfmul_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfmul_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfmul.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply double precision floating point elements in a and elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfsub_s (__m128 a, __m128 b)

+

Synopsis

+
__m128 __lsx_vfsub_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfsub.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract single precision floating point elements in b from elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m128d __lsx_vfsub_d (__m128d a, __m128d b)

+

Synopsis

+
__m128d __lsx_vfsub_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfsub.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract double precision floating point elements in b from elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600034
3C500052
+

__m128 __lsx_vflogb_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vflogb_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vflogb.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute base-2 logarithm of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = log2(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128d __lsx_vflogb_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vflogb_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vflogb.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute base-2 logarithm of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = log2(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128 __lsx_vfsqrt_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = sqrt(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.08
3C5000150.17
+

__m128d __lsx_vfsqrt_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfsqrt.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = sqrt(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000360.06
3C5000360.05
+

__m128 __lsx_vfrsqrt_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrsqrt_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute reciprocal of square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000170.05
3C5000250.11
+

__m128d __lsx_vfrsqrt_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrsqrt_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrt.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute reciprocal of square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000150.04
3C5000150.04
+

__m128 __lsx_vfrecip_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrecip_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecip.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute reciprocal of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = 1 / a.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000110.18
3C5000270.14
+

__m128d __lsx_vfrecip_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrecip_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecip.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute reciprocal of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = 1 / a.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600080.25
3C5000230.08
+

__m128 __lsx_vfrsqrte_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrsqrte_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute estimated reciprocal of square root of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+
+

__m128d __lsx_vfrsqrte_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrsqrte_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrsqrte.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute estimated reciprocal of square root of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+
+

__m128 __lsx_vfrecipe_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrecipe_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute estimated reciprocal of single precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+
+

__m128d __lsx_vfrecipe_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrecipe_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrecipe.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute estimated reciprocal of double precision floating point elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/float_conversion/index.html b/lsx/float_conversion/index.html new file mode 100644 index 0000000..334fb11 --- /dev/null +++ b/lsx/float_conversion/index.html @@ -0,0 +1,2232 @@ + + + + + + + + Floating Point Conversion - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Conversion

+

__m128d __lsx_vfcvth_d_s (__m128 a)

+

Synopsis

+
__m128d __lsx_vfcvth_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvth.d.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single precision floating point elements in higher half of a to double precision.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp32[2 + i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128d __lsx_vfcvtl_d_s (__m128 a)

+

Synopsis

+
__m128d __lsx_vfcvtl_d_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.d.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single precision floating point elements in lower half of a to double precision.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp32[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)

+

Synopsis

+
__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vfcvt.s.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double precision floating point elements in a and b to single precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    dst.fp32[i] = b.fp64[i];
+  } else {
+    dst.fp32[i] = a.fp64[i - 2];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128 __lsx_vfcvth_s_h (__m128i a)

+

Synopsis

+
__m128 __lsx_vfcvth_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvth.s.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert half precision floating point elements in higher half of a to single precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp16[4 + i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128 __lsx_vfcvtl_s_h (__m128i a)

+

Synopsis

+
__m128 __lsx_vfcvtl_s_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vfcvtl.s.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert half precision floating point elements in lower half of a to single precision.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp16[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)

+

Synopsis

+
__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)
+#include <lsxintrin.h>
+Instruction: vfcvt.h.s vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single precision floating point elements in a and b to half precision.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    dst.fp16[i] = b.fp32[i];
+  } else {
+    dst.fp16[i] = a.fp32[i - 4];
+  }
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500031
+

__m128d __lsx_vffinth_d_w (__m128i a)

+

Synopsis

+
__m128d __lsx_vffinth_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffinth.d.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert 32-bit integer elements in higher part of a to double precision floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128d __lsx_vffintl_d_w (__m128i a)

+

Synopsis

+
__m128d __lsx_vffintl_d_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffintl.d.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert 32-bit integer elements in lower part of a to double precision floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128d __lsx_vffint_d_l (__m128i a)

+

Synopsis

+
__m128d __lsx_vffint_d_l (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.l vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert signed 64-bit integer elements in a to double-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128d __lsx_vffint_d_lu (__m128i a)

+

Synopsis

+
__m128d __lsx_vffint_d_lu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.d.lu vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert unsigned 64-bit integer elements in a to double-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128 __lsx_vffint_s_w (__m128i a)

+

Synopsis

+
__m128 __lsx_vffint_s_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert signed 32-bit integer elements in a to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128 __lsx_vffint_s_wu (__m128i a)

+

Synopsis

+
__m128 __lsx_vffint_s_wu (__m128i a)
+#include <lsxintrin.h>
+Instruction: vffint.s.wu vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert unsigned 32-bit integer elements in a to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128 __lsx_vffint_s_l (__m128i a, __m128i b)

+

Synopsis

+
__m128 __lsx_vffint_s_l (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vffint.s.l vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert 64-bit integer elements in a and b to single-precision floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] =
+      (i < 2) ? (f32)(s64)b.dword[i]
+              : (f32)(s64)a.dword[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintl_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintl.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftinth_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftinth_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftinth.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrml_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrml_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrml.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrmh_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrmh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrmh.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrpl_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrpl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrpl.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrph_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrph_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrph.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrzl_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrzl_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzl.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrzh_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrzh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrzh.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrnel_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrnel_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrnel.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in lower part of a to 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrneh_l_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrneh_l_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrneh.l.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in higher part of a to 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftint_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftint_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftint_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftint_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrm_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrm_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrm.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrm_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrm_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrp_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrp_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrp.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrp_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrp_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrz_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrz_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrz_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrz_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrne_l_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrne_l_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrne.l.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrne_w_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrne_w_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftint_lu_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftint_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftint.lu.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to unsigned 64-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftint_wu_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftint_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftint.wu.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to unsigned 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrz_lu_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vftintrz_lu_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vftintrz.lu.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a to unsigned 64-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftintrz_wu_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vftintrz_wu_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vftintrz.wu.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert single-precision floating point elements in a to unsigned 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600044
3C500042
+

__m128i __lsx_vftint_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftint_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftint.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, using current rounding mode specified in fcsr.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrm.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards negative infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrp.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards positive infinity.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrz.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards zero.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+

__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)

+

Synopsis

+
__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)
+#include <lsxintrin.h>
+Instruction: vftintrne.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards nearest even.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500051
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/float_misc/index.html b/lsx/float_misc/index.html new file mode 100644 index 0000000..3f8480c --- /dev/null +++ b/lsx/float_misc/index.html @@ -0,0 +1,767 @@ + + + + + + + + Floating Point Misc - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Floating Point Misc

+

__m128i __lsx_vfclass_d (__m128d a)

+

Synopsis

+
__m128i __lsx_vfclass_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfclass.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Classify each double precision floating point element in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = fp_classify(a.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vfclass_s (__m128 a)

+

Synopsis

+
__m128i __lsx_vfclass_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfclass.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Classify each single precision floating point element in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = fp_classify(a.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128 __lsx_vfrint_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrint_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrint.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, using current rounding mode specified in fcsr, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrint_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrint_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrint.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, using current rounding mode specified in fcsr, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128 __lsx_vfrintrp_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrintrp_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards positive infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrintrp_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrintrp_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrp.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards positive infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128 __lsx_vfrintrm_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrintrm_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards negative infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrintrm_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrintrm_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrm.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards negative infinity, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128 __lsx_vfrintrz_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrintrz_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards zero, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrintrz_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrintrz_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrz.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards zero, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128 __lsx_vfrintrne_s (__m128 a)

+

Synopsis

+
__m128 __lsx_vfrintrne_s (__m128 a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.s vr, vr
+CPU Flags: LSX
+
+

Description

+

Round single-precision floating point elements in a to integers, rounding towards nearest even, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128d __lsx_vfrintrne_d (__m128d a)

+

Synopsis

+
__m128d __lsx_vfrintrne_d (__m128d a)
+#include <lsxintrin.h>
+Instruction: vfrintrne.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Round double-precision floating point elements in a to integers, rounding towards nearest even, and store as floating point numbers.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/fma/index.html b/lsx/fma/index.html new file mode 100644 index 0000000..b2e2ee3 --- /dev/null +++ b/lsx/fma/index.html @@ -0,0 +1,575 @@ + + + + + + + + Fused Multiply-Add - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Fused Multiply-Add

+

__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)

+

Synopsis

+
__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)

+

Synopsis

+
__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)

+

Synopsis

+
__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)

+

Synopsis

+
__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)

+

Synopsis

+
__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)

+

Synopsis

+
__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)

+

Synopsis

+
__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+

__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)

+

Synopsis

+
__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
+#include <lsxintrin.h>
+Instruction: vfnmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+
+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600052
3C500052
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/integer_comparison/index.html b/lsx/integer_comparison/index.html new file mode 100644 index 0000000..aa080f3 --- /dev/null +++ b/lsx/integer_comparison/index.html @@ -0,0 +1,2151 @@ + + + + + + + + Integer Comparison - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Integer Comparison

+

__m128i __lsx_vseq_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vseq_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the 8-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseq_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vseq_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the 16-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseq_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vseq_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the 32-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseq_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vseq_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vseq.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the 64-bit elements in a and b, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the 8-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the 16-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the 32-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vseqi.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the 64-bit elements in a and imm, store all-ones to dst if equal, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslt_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslt_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vslt_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vslt.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than b, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] < imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] < imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] < imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslti.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslti.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] < imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vsle_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 8-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 16-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 32-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsle_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the signed 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vsle_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsle_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsle.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 64-bit elements in a and b, store all-ones to dst if corresponding element in a is less than or equal to b, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 8-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 16-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 32-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vslei.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the signed 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+

__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslei.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compare the unsigned 64-bit elements in a and imm, store all-ones to dst if corresponding element in a is less than or equal to imm, zero otherwise.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600024
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/integer_computation/index.html b/lsx/integer_computation/index.html new file mode 100644 index 0000000..21b0fb7 --- /dev/null +++ b/lsx/integer_computation/index.html @@ -0,0 +1,11899 @@ + + + + + + + + Integer Computation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Integer Computation

+

__m128i __lsx_vadd_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] + b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vadd_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] + b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vadd_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] + b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vadd_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] + b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vadd_q (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadd_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadd.q vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add 128-bit elements in a and b, save the result in dst.

+

Operation

+
dst.qword[0] = a.qword[0] + b.qword[0];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vabsd_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vabsd_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vabsd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vabsd.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute absolute difference of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vadda_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadda_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add the absolute values of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vadda_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadda_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add the absolute values of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vadda_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadda_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add the absolute values of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vadda_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vadda_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vadda.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add the absolute values of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Add 8-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Add 16-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Add 32-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vaddi.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Add 64-bit elements in a and imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] + imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vavg_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+                ((a.byte[i] & b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+                ((a.byte[i] & b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+                ((a.half[i] & b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+                ((a.half[i] & b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+                ((a.word[i] & b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+                ((a.word[i] & b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavg_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+                 ((a.dword[i] & b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vavg_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavg_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavg.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+                 ((a.dword[i] & b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vavgr_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
+                ((a.byte[i] | b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
+                ((a.byte[i] | b.byte[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
+                ((a.half[i] | b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
+                ((a.half[i] | b.half[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
+                ((a.word[i] | b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
+                ((a.word[i] | b.word[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vavgr_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of signed 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
+                 ((a.dword[i] | b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vavgr_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vavgr_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vavgr.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
+                 ((a.dword[i] | b.dword[i]) & 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vdiv_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide signed 8-bit elements in a by signed 8-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 29, 32 | 0.06
3C5000 | 29, 32 | 0.06
+

__m128i __lsx_vdiv_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide unsigned 8-bit elements in a by unsigned 8-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 29, 33 | 0.06
3C5000 | 29, 59 | 0.06
+

__m128i __lsx_vdiv_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide signed 16-bit elements in a by signed 16-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 17 | 0.12
3C5000 | 17, 21.5 | 0.09
+

__m128i __lsx_vdiv_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide unsigned 16-bit elements in a by unsigned 16-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 17, 22 | 0.11
3C5000 | 17, 21.5 | 0.07
+

__m128i __lsx_vdiv_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide signed 32-bit elements in a by signed 32-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 11 | 0.18
3C5000 | 11, 17.5 | 0.09
+

__m128i __lsx_vdiv_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide unsigned 32-bit elements in a by unsigned 32-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 11 | 0.18
3C5000 | 11, 17.5 | 0.07
+

__m128i __lsx_vdiv_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide signed 64-bit elements in a by signed 64-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 8 | 0.25
3C5000 | 8, 18.5 | 0.11
+

__m128i __lsx_vdiv_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vdiv_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vdiv.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Divide unsigned 64-bit elements in a by unsigned 64-bit elements in b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 8 | 0.25
3C5000 | 8, 18.5 | 0.11
+

__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 8-bit elements in a to even-positioned signed 8-bit elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 8-bit elements in a to even-positioned unsigned 8-bit elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 16-bit elements in a to even-positioned signed 16-bit elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 16-bit elements in a to even-positioned unsigned 16-bit elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 32-bit elements in a to even-positioned signed 32-bit elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 32-bit elements in a to even-positioned unsigned 32-bit elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned signed 64-bit elements in a to even-positioned signed 64-bit elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhaddw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Add odd-positioned unsigned 64-bit elements in a to even-positioned unsigned 64-bit elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 8-bit elements in b from odd-positioned signed 8-bit elements in a to get 16-bit results.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.hu.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 8-bit elements in b from odd-positioned unsigned 8-bit elements in a to get 16-bit results.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 16-bit elements in b from odd-positioned signed 16-bit elements in a to get 32-bit results.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.wu.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 16-bit elements in b from odd-positioned unsigned 16-bit elements in a to get 32-bit results.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 32-bit elements in b from odd-positioned signed 32-bit elements in a to get 64-bit results.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.du.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 32-bit elements in b from odd-positioned unsigned 32-bit elements in a to get 64-bit results.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 2
3C5000 | 2 | 2
+

__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 64-bit elements in b from odd-positioned signed 64-bit elements in a to get a 128-bit result.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vhsubw.qu.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 64-bit elements in b from odd-positioned unsigned 64-bit elements in a to get a 128-bit result.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 8-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 16-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 32-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmadd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 64-bit elements in b and c, add to elements in a, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] =
+      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] =
+      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] =
+      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+                (u32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+                 (u64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+                 (u128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 1.14
3C5000 | 7 | 1.14
+

__m128i __lsx_vmax_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmax_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmax_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmax_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmax.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for signed 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmaxi.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise maximum for unsigned 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmin_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 8-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 16-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 32-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmin_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmin_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmin_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmin.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 64-bit elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 8-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 16-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 32-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)

+

Synopsis

+
__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)
+#include <lsxintrin.h>
+Instruction: vmini.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for signed 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vmini.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute elementwise minimum for unsigned 64-bit elements in a and imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 2 | 4
3C5000 | 2 | 2
+

__m128i __lsx_vmod_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing signed 8-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 35 | 0.06
3C5000 | 29, 33 | 0.06
+

__m128i __lsx_vmod_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing unsigned 8-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 29, 37 | 0.06
3C5000 | 29, 33 | 0.05
+

__m128i __lsx_vmod_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing signed 16-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17, 21 | 0.12
3C5000 | 17, 21 | 0.09
+

__m128i __lsx_vmod_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing unsigned 16-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 17, 21 | 0.11
3C5000 | 17, 21 | 0.07
+

__m128i __lsx_vmod_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing signed 32-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11, 13 | 0.18
3C5000 | 11, 15 | 0.08
+

__m128i __lsx_vmod_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing unsigned 32-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 11, 13 | 0.18
3C5000 | 11, 15 | 0.06
+

__m128i __lsx_vmod_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing signed 64-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8, 10 | 0.25
3C5000 | 8, 10 | 0.11
+

__m128i __lsx_vmod_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmod_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmod.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute the remainder of dividing unsigned 64-bit elements in a by elements in b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 8, 10 | 0.25
3C5000 | 8, 10 | 0.11
+

__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 8-bit elements in b and c, subtract the product from the elements in a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 16-bit elements in b and c, subtract the product from the elements in a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 32-bit elements in b and c, subtract the product from the elements in a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vmsub.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 64-bit elements in b and c, subtract the product from the elements in a, and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply signed 8-bit elements in a and b, save the high 8-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply unsigned 8-bit elements in a and b, save the high 8-bit result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply signed 16-bit elements in a and b, save the high 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply unsigned 16-bit elements in a and b, save the high 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply signed 32-bit elements in a and b, save the high 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply unsigned 32-bit elements in a and b, save the high 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply signed 64-bit elements in a and b, save the high 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmuh_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmuh_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmuh.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply unsigned 64-bit elements in a and b, save the high 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmul_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmul_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] * b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmul_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmul_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] * b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmul_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmul_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] * b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmul_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmul_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmul.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] * b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 7 | 2
3C5000 | 7 | 2
+

__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (IPC)
3A6000 | 4 | 2
3C5000 | 4 | 2
+

__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500042
+

__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vmulwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Multiply odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600072
3C500072
+

__m128i __lsx_vneg_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vneg_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.b vr, vr
+CPU Flags: LSX
+
+

Description

+

Negate 8-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = -a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vneg_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vneg_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.h vr, vr
+CPU Flags: LSX
+
+

Description

+

Negate 16-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = -a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vneg_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vneg_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.w vr, vr
+CPU Flags: LSX
+
+

Description

+

Negate 32-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = -a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vneg_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vneg_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vneg.d vr, vr
+CPU Flags: LSX
+
+

Description

+

Negate 64-bit elements in a and save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = -a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the signed 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the unsigned 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the signed 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the unsigned 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the signed 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the unsigned 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the signed 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsadd_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsadd_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsadd.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating add the unsigned 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the signed 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the unsigned 8-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the signed 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the unsigned 16-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the signed 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the unsigned 32-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the signed 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vssub_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssub_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssub.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Saturating subtract the unsigned 64-bit elements in a and b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 8-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] - b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 16-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] - b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 32-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] - b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 64-bit elements in a and b, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] - b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsub_q (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsub_q (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsub.q vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract 128-bit elements in a and b, save the result in dst.

+

Operation

+
dst.qword[0] = a.qword[0] - b.qword[0];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Subtract 8-bit elements in a by imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Subtract 16-bit elements in a by imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Subtract 32-bit elements in a by imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsubi.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Subtract 64-bit elements in a by imm, save the result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] - imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwev.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.h.bu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.w.hu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.d.wu vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsubwod.q.du vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Subtract odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

+

Operation

+
for (int i = 0; i < 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/logical/index.html b/lsx/logical/index.html new file mode 100644 index 0000000..001c1f5 --- /dev/null +++ b/lsx/logical/index.html @@ -0,0 +1,681 @@ + + + + + + + + Logical - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Logical

+

__m128i __lsx_vand_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vand_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vand.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise AND between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] & b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vandi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise AND between elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] & imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vandn_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vandn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vandn.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise ANDN between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = b.dword[i] & (~a.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vnor_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vnor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vnor.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise NOR between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vnori.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise NOR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ~(a.byte[i] | imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vor_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vor.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise OR between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] | b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vori_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vori.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise OR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] | imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vorn_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vorn_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vorn.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise ORN between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vxor_v (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vxor_v (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vxor.v vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Compute bitwise XOR between elements in a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vxori.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute bitwise XOR between elements in a and imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] ^ imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/memory/index.html b/lsx/memory/index.html new file mode 100644 index 0000000..86038fe --- /dev/null +++ b/lsx/memory/index.html @@ -0,0 +1,467 @@ + + + + + + + + Memory Load & Store - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Memory Load & Store

+

__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)

+

Synopsis

+
__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vld vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read whole vector from memory address addr + offset, save the data into dst.

+

Operation

+
dst = memory_load(128, addr + offset);
+
+

__m128i __lsx_vldx (void * addr, long int offset)

+

Synopsis

+
__m128i __lsx_vldx (void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vldx vr, r, r
+CPU Flags: LSX
+
+

Description

+

Read whole vector from memory address addr + offset, save the data into dst.

+

Operation

+
dst = memory_load(128, addr + offset);
+
+

__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)

+

Synopsis

+
__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.b vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read 8-bit data from memory address addr + (offset << 0), replicate the data to all vector lanes and save into dst.

+

Operation

+
u8 data = memory_load(8, addr + offset);
+for (int i = 0; i < 16; i++) {
+  dst.byte[i] = data;
+}
+
+

__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)

+

Synopsis

+
__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.h vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read 16-bit data from memory address addr + (offset << 1), replicate the data to all vector lanes and save into dst.

+

Operation

+
u16 data = memory_load(16, addr + (offset << 1));
+for (int i = 0; i < 8; i++) {
+  dst.half[i] = data;
+}
+
+

__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)

+

Synopsis

+
__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.w vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read 32-bit data from memory address addr + (offset << 2), replicate the data to all vector lanes and save into dst.

+

Operation

+
u32 data = memory_load(32, addr + (offset << 2));
+for (int i = 0; i < 4; i++) {
+  dst.word[i] = data;
+}
+
+

__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)

+

Synopsis

+
__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)
+#include <lsxintrin.h>
+Instruction: vldrepl.d vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Read 64-bit data from memory address addr + (offset << 3), replicate the data to all vector lanes and save into dst.

+

Operation

+
u64 data = memory_load(64, addr + (offset << 3));
+for (int i = 0; i < 2; i++) {
+  dst.dword[i] = data;
+}
+
+

void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)

+

Synopsis

+
void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)
+#include <lsxintrin.h>
+Instruction: vst vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Write the whole 128-bit vector data to memory address addr + offset.

+

Operation

+
memory_store(128, data, addr + offset);
+
+

void __lsx_vstx (__m128i data, void * addr, long int offset)

+

Synopsis

+
void __lsx_vstx (__m128i data, void * addr, long int offset)
+#include <lsxintrin.h>
+Instruction: vstx vr, r, r
+CPU Flags: LSX
+
+

Description

+

Write the whole 128-bit vector data to memory address addr + offset.

+

Operation

+
memory_store(128, data, addr + offset);
+
+

void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)

+

Synopsis

+
void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.b vr, r, imm, imm
+CPU Flags: LSX
+
+

Description

+

Store the 8-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(8, data.byte[lane], addr + offset);
+
+

void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)

+

Synopsis

+
void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.h vr, r, imm, imm
+CPU Flags: LSX
+
+

Description

+

Store the 16-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(16, data.half[lane], addr + offset);
+
+

void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)

+

Synopsis

+
void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.w vr, r, imm, imm
+CPU Flags: LSX
+
+

Description

+

Store the 32-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(32, data.word[lane], addr + offset);
+
+

void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)

+

Synopsis

+
void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)
+#include <lsxintrin.h>
+Instruction: vstelm.d vr, r, imm, imm
+CPU Flags: LSX
+
+

Description

+

Store the 64-bit element in data specified by lane to memory address addr + offset.

+

Operation

+
memory_store(64, data.dword[lane], addr + offset);
+
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/misc/index.html b/lsx/misc/index.html new file mode 100644 index 0000000..cac5719 --- /dev/null +++ b/lsx/misc/index.html @@ -0,0 +1,3927 @@ + + + + + + + + Misc - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Misc

+

__m128i __lsx_vilvh_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvh_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 8-bit elements in higher half of a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vilvh_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvh_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 16-bit elements in higher half of a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vilvh_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvh_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 32-bit elements in higher half of a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vilvh_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvh_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvh.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 64-bit elements in higher half of a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vilvl_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 8-bit elements in lower half of a and b.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vilvl_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 16-bit elements in lower half of a and b.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vilvl_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 32-bit elements in lower half of a and b.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vilvl_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vilvl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vilvl.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Interleave 64-bit elements in lower half of a and b.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.b vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Insert the 8-bit value b into the lane of a indexed by imm; all other lanes are copied from a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i == imm) ? b : a.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.h vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Insert the 16-bit value b into the lane of a indexed by imm; all other lanes are copied from a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i == imm) ? b : a.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)

+

Synopsis

+
__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.w vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Insert the 32-bit value b into the lane of a indexed by imm; all other lanes are copied from a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i == imm) ? b : a.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)

+

Synopsis

+
__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)
+#include <lsxintrin.h>
+Instruction: vinsgr2vr.d vr, r, imm
+CPU Flags: LSX
+
+

Description

+

Insert the 64-bit value b into the lane of a indexed by imm; all other lanes are copied from a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

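Find the first negative 8-bit element in b and write its index into the lane of a selected by the lowest 8-bit element of c (modulo 16); all other lanes are copied from a. If b has no negative element, the index written is 16.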

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[c.byte[0] % 16] = i;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vfrstp.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

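Find the first negative 16-bit element in b and write its index into the lane of a selected by the lowest 16-bit element of c (modulo 8); all other lanes are copied from a. If b has no negative element, the index written is 8.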

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[c.half[0] % 8] = i;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Find the first negative 8-bit element in b and write its index into the lane of a selected by imm (modulo 16); all other lanes are copied from a. If b has no negative element, the index written is 16.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i < 16; i++) {
+  if ((s8)b.byte[i] < 0) {
+    break;
+  }
+}
+dst.byte[imm % 16] = i;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vfrstpi.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Find the first negative 16-bit element in b and write its index into the lane of a selected by imm (modulo 8); all other lanes are copied from a. If b has no negative element, the index written is 8.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i < 8; i++) {
+  if ((s16)b.half[i] < 0) {
+    break;
+  }
+}
+dst.half[imm % 8] = i;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vmskgez_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskgez_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskgez.b vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, if the element is greater than or equal to zero, set the bit of dst at the element's index, otherwise clear it. All bits of dst above bit 15 are zero.

+

Operation

+
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vmskltz_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskltz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.b vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, if the element is less than zero, set the bit of dst at the element's index, otherwise clear it. All bits of dst above bit 15 are zero.

+

Operation

+
u64 m = 0x8080808080808080;
+u64 c = m & a.dword[0];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vmskltz_h (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskltz_h (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.h vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 16-bit element in a, if the element is less than zero, set the bit of dst at the element's index, otherwise clear it. All bits of dst above bit 7 are zero.

+

Operation

+
u64 m = 0x8000800080008000;
+u64 c = m & a.dword[0];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 15;
+c |= c << 30;
+c >>= 60;
+dst.dword[0] |= c << 4;
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vmskltz_w (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskltz_w (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.w vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 32-bit element in a, if the element is less than zero, set the bit of dst at the element's index, otherwise clear it. All bits of dst above bit 3 are zero.

+

Operation

+
u64 m = 0x8000000080000000;
+u64 c = m & a.dword[0];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c |= c << 31;
+c >>= 62;
+dst.dword[0] |= c << 2;
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vmskltz_d (__m128i a)

+

Synopsis

+
__m128i __lsx_vmskltz_d (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmskltz.d vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 64-bit element in a, if the element is less than zero, set the bit of dst at the element's index, otherwise clear it. All bits of dst above bit 1 are zero.

+

Operation

+
u64 m = 0x8000000000000000;
+u64 c = m & a.dword[0];
+c >>= 63;
+dst.dword[0] = c;
+c = m & a.dword[1];
+c >>= 63;
+dst.dword[0] |= c << 1;
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vmsknz_b (__m128i a)

+

Synopsis

+
__m128i __lsx_vmsknz_b (__m128i a)
+#include <lsxintrin.h>
+Instruction: vmsknz.b vr, vr
+CPU Flags: LSX
+
+

Description

+

For each 8-bit element in a, if the element is non-zero, set the bit of dst at the element's index, otherwise clear it. All bits of dst above bit 15 are zero.

+

Operation

+
u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
+c |= c << 7;
+c |= c << 14;
+c |= c << 28;
+c >>= 56;
+dst.dword[0] |= c << 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackev_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect the even-positioned 8-bit elements of a and b, pack them and store the result in dst: elements from b go to the even lanes, elements from a go to the odd lanes.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackev_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect the even-positioned 16-bit elements of a and b, pack them and store the result in dst: elements from b go to the even lanes, elements from a go to the odd lanes.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackev_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect the even-positioned 32-bit elements of a and b, pack them and store the result in dst: elements from b go to the even lanes, elements from a go to the odd lanes.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackev_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackev.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect the even-positioned 64-bit elements of a and b, pack them and store the result in dst: elements from b go to the even lanes, elements from a go to the odd lanes.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackod_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect the odd-positioned 8-bit elements of a and b, pack them and store the result in dst: elements from b go to the even lanes, elements from a go to the odd lanes.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackod_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect the odd-positioned 16-bit elements of a and b, pack them and store the result in dst: elements from b go to the even lanes, elements from a go to the odd lanes.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackod_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect the odd-positioned 32-bit elements of a and b, pack them and store the result in dst: elements from b go to the even lanes, elements from a go to the odd lanes.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpackod_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpackod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpackod.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Collect the odd-positioned 64-bit elements of a and b, pack them and store the result in dst: elements from b go to the even lanes, elements from a go to the odd lanes.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickev_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickev_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick even-positioned 8-bit elements in b first, then pick even-positioned 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickev_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickev_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick even-positioned 16-bit elements in b first, then pick even-positioned 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickev_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickev_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick even-positioned 32-bit elements in b first, then pick even-positioned 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickev_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickev_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickev.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick even-positioned 64-bit elements in b first, then pick even-positioned 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)

+

Synopsis

+
int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.b r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the 8-bit lane specified by idx from a, sign-extend it and store it into dst.

+

Operation

+
dst = (s8)a.byte[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)

+

Synopsis

+
unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.bu r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the 8-bit lane specified by idx from a, zero-extend it and store it into dst.

+

Operation

+
dst = (u8)a.byte[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)

+

Synopsis

+
int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.h r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the 16-bit lane specified by idx from a, sign-extend it and store it into dst.

+

Operation

+
dst = (s16)a.half[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)

+

Synopsis

+
unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.hu r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the 16-bit lane specified by idx from a, zero-extend it and store it into dst.

+

Operation

+
dst = (u16)a.half[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)

+

Synopsis

+
int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.w r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s32)a.word[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)

+

Synopsis

+
unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.wu r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u32)a.word[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)

+

Synopsis

+
long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.d r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (s64)a.dword[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)

+

Synopsis

+
unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vpickve2gr.du r, vr, imm
+CPU Flags: LSX
+
+

Description

+

Pick the lane specified by idx from a and store into dst.

+

Operation

+
dst = (u64)a.dword[idx];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vpickod_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickod_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick odd-positioned 8-bit elements in b first, then pick odd-positioned 8-bit elements in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickod_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickod_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick odd-positioned 16-bit elements in b first, then pick odd-positioned 16-bit elements in a.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickod_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickod_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick odd-positioned 32-bit elements in b first, then pick odd-positioned 32-bit elements in a.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vpickod_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vpickod_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vpickod.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Pick odd-positioned 64-bit elements in b first, then pick odd-positioned 64-bit elements in a.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vrepli_b (imm_n512_511 imm)

+

Synopsis

+
__m128i __lsx_vrepli_b (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = imm;
+}
+
+

Tested on real machine.

+

__m128i __lsx_vrepli_h (imm_n512_511 imm)

+

Synopsis

+
__m128i __lsx_vrepli_h (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = imm;
+}
+
+

Tested on real machine.

+

__m128i __lsx_vrepli_w (imm_n512_511 imm)

+

Synopsis

+
__m128i __lsx_vrepli_w (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = imm;
+}
+
+

Tested on real machine.

+

__m128i __lsx_vrepli_d (imm_n512_511 imm)

+

Synopsis

+
__m128i __lsx_vrepli_d (imm_n512_511 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat imm to fill whole vector.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = imm;
+}
+
+

Tested on real machine.

+

__m128i __lsx_vreplgr2vr_b (int val)

+

Synopsis

+
__m128i __lsx_vreplgr2vr_b (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.b vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat val to fill the whole vector as 8-bit elements.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A1
3C5000N/A1
+

__m128i __lsx_vreplgr2vr_h (int val)

+

Synopsis

+
__m128i __lsx_vreplgr2vr_h (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.h vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat val to fill the whole vector as 16-bit elements.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A1
3C5000N/A1
+

__m128i __lsx_vreplgr2vr_w (int val)

+

Synopsis

+
__m128i __lsx_vreplgr2vr_w (int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.w vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat val to fill the whole vector as 32-bit elements.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A1
3C5000N/A1
+

__m128i __lsx_vreplgr2vr_d (long int val)

+

Synopsis

+
__m128i __lsx_vreplgr2vr_d (long int val)
+#include <lsxintrin.h>
+Instruction: vreplgr2vr.d vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat val to fill the whole vector as 64-bit elements.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = val;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A6000N/A1
3C5000N/A1
+

__m128i __lsx_vreplve_b (__m128i a, int idx)

+

Synopsis

+
__m128i __lsx_vreplve_b (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.b vr, vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat the 8-bit element in lane idx of a to fill the whole vector. The lane index is idx modulo 16.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[idx % 16];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vreplve_h (__m128i a, int idx)

+

Synopsis

+
__m128i __lsx_vreplve_h (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.h vr, vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[idx % 8];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vreplve_w (__m128i a, int idx)

+

Synopsis

+
__m128i __lsx_vreplve_w (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.w vr, vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[idx % 4];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vreplve_d (__m128i a, int idx)

+

Synopsis

+
__m128i __lsx_vreplve_d (__m128i a, int idx)
+#include <lsxintrin.h>
+Instruction: vreplve.d vr, vr, r
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[idx % 2];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600011
3C500011
+

__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)

+

Synopsis

+
__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[idx];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)

+

Synopsis

+
__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[idx];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)

+

Synopsis

+
__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[idx];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)

+

Synopsis

+
__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)
+#include <lsxintrin.h>
+Instruction: vreplvei.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Repeat the element in lane idx of a to fill whole vector.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[idx];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp signed 8-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsat.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp unsigned 8-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp signed 16-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsat.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp unsigned 16-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp signed 32-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsat.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp unsigned 32-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp signed 64-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsat.du vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Clamp unsigned 64-bit elements in a to range specified by imm.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500022
+

__m128i __lsx_vsigncov_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsigncov_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

If the 8-bit element in a equals zero, set the result to zero. If the signed 8-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vsigncov_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsigncov_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

If the 16-bit element in a equals zero, set the result to zero. If the signed 16-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vsigncov_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsigncov_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

If the 32-bit element in a equals zero, set the result to zero. If the signed 32-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] =
+      (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vsigncov_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsigncov_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsigncov.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

If the 64-bit element in a equals zero, set the result to zero. If the signed 64-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] =
+      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vldi (imm_n1024_1023 imm)

+

Synopsis

+
__m128i __lsx_vldi (imm_n1024_1023 imm)
+#include <lsxintrin.h>
+Instruction: vldi vr, imm
+CPU Flags: LSX
+
+

Description

+

Initialize dst using predefined patterns:

+
    +
  • imm[12:10]=0b000: broadcast imm[7:0] as 8-bit elements to all lanes
  • +
  • imm[12:10]=0b001: broadcast sign-extended imm[9:0] as 16-bit elements to all lanes
  • +
  • imm[12:10]=0b010: broadcast sign-extended imm[9:0] as 32-bit elements to all lanes
  • +
  • imm[12:10]=0b011: broadcast sign-extended imm[9:0] as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes
  • +
  • imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes
  • +
  • imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes
  • +
  • imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast the result as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b11010: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 32-bit elements to all lanes
  • +
  • imm[12:8]=0b11011: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 64-bit elements to all lanes
  • +
  • imm[12:8]=0b11100: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48) as 64-bit elements to all lanes
  • +
+

Operation

+
u64 imm12_10 = (imm >> 10) & 0b111;
+u64 imm12_8 = (imm >> 8) & 0b11111;
+u64 imm9_0 = imm & 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
+u64 imm7_0 = imm & 0xFF;
+u64 imm7 = (imm >> 7) & 0x1;
+u64 imm6 = (imm >> 6) & 0x1;
+u64 imm5 = (imm >> 5) & 0x1;
+u64 imm5_0 = imm & 0x3F;
+u64 imm4 = (imm >> 4) & 0x1;
+u64 imm3 = (imm >> 3) & 0x1;
+u64 imm2 = (imm >> 2) & 0x1;
+u64 imm1 = (imm >> 1) & 0x1;
+u64 imm0 = imm & 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+  broadcast_value = simm9_0;
+  broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+  broadcast_value = simm9_0;
+  broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+  broadcast_value = simm9_0;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+  broadcast_value = imm7_0 << 16;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+  broadcast_value = imm7_0 << 24;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+  broadcast_value = imm7_0;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+  broadcast_value = imm7_0 << 8;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+  broadcast_value = (imm7_0 << 8) | 0xFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+  broadcast_value = (imm7_0 << 16) | 0xFFFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+                    imm7 * 0xFF00000000000000;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
+                    (imm5_0 << 19);
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+  broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
+                    (imm5_0 << 48);
+  broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+  for (int i = 0; i < 16; i++) {
+    dst.byte[i] = broadcast_value;
+  }
+} else if (broadcast_width == 16) {
+  for (int i = 0; i < 8; i++) {
+    dst.half[i] = broadcast_value;
+  }
+} else if (broadcast_width == 32) {
+  for (int i = 0; i < 4; i++) {
+    dst.word[i] = broadcast_value;
+  }
+} else if (broadcast_width == 64) {
+  for (int i = 0; i < 2; i++) {
+    dst.dword[i] = broadcast_value;
+  }
+}
+
+

Tested on real machine.

+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/permutation/index.html b/lsx/permutation/index.html new file mode 100644 index 0000000..867a70e --- /dev/null +++ b/lsx/permutation/index.html @@ -0,0 +1,241 @@ + + + + + + + + Permutation - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Permutation

+

__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vpermi.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Permute words from a and b with indices recorded in imm and store into dst.

+

Operation

+
dst.word[0] = b.word[imm & 0x3];
+dst.word[1] = b.word[(imm >> 2) & 0x3];
+dst.word[2] = a.word[(imm >> 4) & 0x3];
+dst.word[3] = a.word[(imm >> 6) & 0x3];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/shift/index.html b/lsx/shift/index.html new file mode 100644 index 0000000..7198eee --- /dev/null +++ b/lsx/shift/index.html @@ -0,0 +1,7868 @@ + + + + + + + + Shift - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Shift

+

__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsll.v vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute whole vector a shifted left by imm * 8 bits.

+

Operation

+
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] << shift;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vbsrl.v vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Compute whole vector a shifted right by imm * 8 bits.

+

Operation

+
int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] >> shift;
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsll_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsll_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsll_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsll_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] << (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsll_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsll_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsll_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsll_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsll.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vslli.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vslli.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vslli.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vslli.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical left shift the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.h.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Extend and shift signed 8-bit elements in a by imm to signed 16-bit result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.hu.bu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Extend and shift unsigned 8-bit elements in a by imm to unsigned 16-bit result.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.w.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Extend and shift signed 16-bit elements in a by imm to signed 32-bit result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.wu.hu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Extend and shift unsigned 16-bit elements in a by imm to unsigned 32-bit result.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.d.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Extend and shift signed 32-bit elements in a by imm to signed 64-bit result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsllwil.du.wu vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Extend and shift unsigned 32-bit elements in a by imm to unsigned 64-bit result.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i] << imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsra_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsra_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsra_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsra_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsra_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsra_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsra_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsra_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsra.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = ((s16)a.half[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = ((s32)a.word[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrai.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vsran_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsran_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsran_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsran.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600022
3C500021
+

__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
+                        : (s32)((s64)a.dword[i - 2] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
+                         : (s64)((s128)a.qword[i - 1] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrar_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrar_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((b.byte[i] & 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
+                  (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrar_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrar_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((b.half[i] & 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
+                  (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrar_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrar_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((b.word[i] & 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
+                  (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrar_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrar_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrar.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if ((b.dword[i] & 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
+                   (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] =
+        ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] =
+        ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrari.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] =
+        ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u8 shift = (b.half[i] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
+                         (((s16)a.half[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u8 shift = (b.word[i] & 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
+                          (((s32)a.word[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u8 shift = (b.dword[i] & 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
+                          (((s64)a.dword[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
+                         (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
+                          (((s32)b.word[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
+                          (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
+                          (((s64)b.dword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
+                          (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
+                           (((s128)b.qword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
+                           (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrl_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrl_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vsrl_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrl_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vsrl_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrl_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vsrl_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrl_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrl.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrli.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = a.dword[i] >> imm;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 1 | 4
3C5000 | 1 | 2
+

__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 1
+

__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 1
+

__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 2 | 2
3C5000 | 2 | 1
+

__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] =
+      (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
+                        : (u32)((u64)a.dword[i - 2] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
+                         : (u64)((u128)a.qword[i - 1] >> imm);
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlr_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if ((b.byte[i] & 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
+                  ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlr_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((b.half[i] & 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
+                  ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlr_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((b.word[i] & 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
+                  ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlr_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlr.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if ((b.dword[i] & 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
+                   ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlri.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u8 shift = (b.half[i] & 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
+                         (((u16)a.half[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u8 shift = (b.word[i] & 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
+                          (((u32)a.word[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vsrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u8 shift = (b.dword[i] & 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
+                          (((u64)a.dword[i] >> (shift - 1)) & 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
+                         (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
+                          (((u32)b.word[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
+                          (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
+                          (((u64)b.dword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
+                          (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vsrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
+                           (((u128)b.qword[i] >> (imm - 1)) & 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
+                           (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
+    }
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssran_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.bu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssran_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.hu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssran_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssran.wu.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)b.half[i] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.bu.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp = (s16)b.half[i] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)b.word[i] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.hu.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp = (s32)b.word[i] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)b.dword[i] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.wu.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp = (s64)b.dword[i] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp = (s128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrani.du.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift the signed 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp = (s128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
+             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
+             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrarn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, -128, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<s16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, -32768, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<s32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> imm) +
+             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] >> imm) +
+             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] >> imm) +
+             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrarni.du.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] >> imm) +
+             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 3 | 2
3C5000 | 3 | 2
+

__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.bu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.hu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPU | Latency | Throughput (CPI)
3A6000 | 4 | 2
3C5000 | 4 | 1
+

__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrln.wu.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)b.half[i] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp = (u16)b.half[i] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 8] >> imm;
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)b.word[i] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp = (u32)b.word[i] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 4] >> imm;
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)b.dword[i] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp = (u64)b.dword[i] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] >> imm;
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp = (u128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlni.du.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp = (u128)b.qword[i] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] >> imm;
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.bu.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if ((b.half[i] & 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
+             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.hu.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if ((b.word[i] & 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
+             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vssrlrn.wu.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if ((b.dword[i] & 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
+             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 127);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.bu.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (i < 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
+    }
+    dst.byte[i] = clamp<u16>(temp, 0, 255);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 32767);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.hu.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if (i < 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
+    }
+    dst.half[i] = clamp<u32>(temp, 0, 65535);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> imm) +
+             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.wu.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if (i < 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] >> imm) +
+             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
+    }
+    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600042
3C500041
+

__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] >> imm) +
+             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)

+

Synopsis

+
__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include <lsxintrin.h>
+Instruction: vssrlrni.du.q vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if (i < 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] >> imm) +
+             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
+    }
+    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600032
3C500032
+

__m128i __lsx_vrotr_b (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vrotr_b (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.b vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 8-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] =
+      (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotr_h (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vrotr_h (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 16-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
+                (a.half[i] << (16 - (b.half[i] & 0xf)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotr_w (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vrotr_w (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 32-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
+                (a.word[i] << (32 - (b.word[i] & 0x1f)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotr_d (__m128i a, __m128i b)

+

Synopsis

+
__m128i __lsx_vrotr_d (__m128i a, __m128i b)
+#include <lsxintrin.h>
+Instruction: vrotr.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 64-bit elements in a by elements in b, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
+                 (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)

+

Synopsis

+
__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 8-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)

+

Synopsis

+
__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 16-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)

+

Synopsis

+
__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 32-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+

__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)

+

Synopsis

+
__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)
+#include <lsxintrin.h>
+Instruction: vrotri.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Rotate right the unsigned 64-bit elements in a by imm, store the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500022
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + + diff --git a/lsx/shuffling/index.html b/lsx/shuffling/index.html new file mode 100644 index 0000000..b365f94 --- /dev/null +++ b/lsx/shuffling/index.html @@ -0,0 +1,609 @@ + + + + + + + + Shuffling - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + + +
  • +
  • +
+
+
+
+
+ +

Shuffling

+

__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.b vr, vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Shuffle bytes from a and b with indices from c, save the result to dst.

+

Caveat: the indices are placed in c, while in other vshuf intrinsics, they are placed in a.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  if (c.byte[i] >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.byte[i] = 0;
+  } else if ((c.byte[i] % 32) < 16) {
+    dst.byte[i] = b.byte[c.byte[i] % 16];
+  } else {
+    dst.byte[i] = a.byte[c.byte[i] % 16];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.h vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Shuffle 16-bit elements in b and c with indices from a, save the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.half[i] = 0;
+  } else if ((a.half[i] % 16) < 8) {
+    dst.half[i] = c.half[a.half[i] % 8];
+  } else {
+    dst.half[i] = b.half[a.half[i] % 8];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.w vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Shuffle 32-bit elements in b and c with indices from a, save the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.word[i] = 0;
+  } else if ((a.word[i] % 8) < 4) {
+    dst.word[i] = c.word[a.word[i] % 4];
+  } else {
+    dst.word[i] = b.word[a.word[i] % 4];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)

+

Synopsis

+
__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
+#include <lsxintrin.h>
+Instruction: vshuf.d vr, vr, vr
+CPU Flags: LSX
+
+

Description

+

Shuffle 64-bit elements in b and c with indices from a, save the result to dst.

+

Operation

+
for (int i = 0; i < 2; i++) {
+  if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.dword[i] = 0;
+  } else if ((a.dword[i] % 4) < 2) {
+    dst.dword[i] = c.dword[a.dword[i] % 2];
+  } else {
+    dst.dword[i] = b.dword[a.dword[i] % 2];
+  }
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600012
3C500012
+

__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.b vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Shuffle every four 8-bit elements in a with indices packed in imm, save the result to dst.

+

Operation

+
for (int i = 0; i < 16; i++) {
+  dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.h vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Shuffle every four 16-bit elements in a with indices packed in imm, save the result to dst.

+

Operation

+
for (int i = 0; i < 8; i++) {
+  dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.w vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

Shuffle every four 32-bit elements in a with indices packed in imm, save the result to dst.

+

Operation

+
for (int i = 0; i < 4; i++) {
+  dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
+}
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+

__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)

+

Synopsis

+
__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)
+#include <lsxintrin.h>
+Instruction: vshuf4i.d vr, vr, imm
+CPU Flags: LSX
+
+

Description

+

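Shuffle the four 64-bit elements in a and b (two from each) with indices packed in imm, save the result to dst. Only the low four bits of imm are used: bits 1:0 select the source of dst.dword[0] and bits 3:2 select the source of dst.dword[1].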

+

Operation

+
dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
+dst.dword[1] =
+    (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];
+
+

Tested on real machine.

+

Latency and Throughput

+ + + + + + + + + + + + + + + + + + + + +
CPULatencyThroughput (CPI)
3A600014
3C500012
+ +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + +
+ + + + + + + + diff --git a/main.css b/main.css new file mode 100644 index 0000000..f7ffbf8 --- /dev/null +++ b/main.css @@ -0,0 +1,3 @@ +[v-cloak] { + display: none +} \ No newline at end of file diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 0000000..47c9e0e --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,168 @@ + + + + https://jia.je/unofficial-loongarch-intrinsics-guide/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/latency_throughput/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/viewer/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/bitwise_operations/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/branch/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_comparison/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_computation/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_conversion/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_misc/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/fma/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/integer_comparison/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/integer_computation/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/logical/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/memory/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/permutation/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/shift/ + 2023-12-14 + daily + + + 
https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/shuffling/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/bitwise_operations/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/branch/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_comparison/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_computation/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_conversion/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_misc/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/fma/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_comparison/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_computation/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/logical/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/memory/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/misc/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/permutation/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/shift/ + 2023-12-14 + daily + + + https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/shuffling/ + 2023-12-14 + daily + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 0000000..2a938d2 Binary files /dev/null and b/sitemap.xml.gz differ diff --git a/viewer/index.html b/viewer/index.html new file mode 100644 index 0000000..eaa1509 --- /dev/null +++ b/viewer/index.html @@ -0,0 +1,296 @@ + + + + + + + + All Intrinsics - Unofficial LoongArch Intrinsics Guide + + + + + + + + + + + + + + +
+ + +
+ +
+
+
    +
  • + +
  • +
  • +
+
+
+
+
+ +

All Intrinsics

+
+ +
+ Loading... Please wait... +

+
+ +
+ +Categories: +

+ +
+ + +
+ +

+Instruction Set Extensions: +

+ +
+ + +
+ +

+Filter by content: +

+ + + +

+

+ +Found {{intrinsics.length}} intrinsics. + +

+

+

+
+ {{ intrinsic.name }} +
+
+ +

+

+ +
+
+ + + +
+
+ +
+
+ +
+ +
+ +
+ + + + « Previous + + + Next » + + +
+ + + + + + + +