Feature/add website to show metrics #81

Merged · 5 commits · Jun 20, 2024
6 changes: 6 additions & 0 deletions apps/nodejs/web/.eslintrc.json
@@ -0,0 +1,6 @@
{
  "extends": [
    "next",
    "prettier"
  ]
}
36 changes: 36 additions & 0 deletions apps/nodejs/web/.gitignore
@@ -0,0 +1,36 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.

# dependencies
/node_modules
/.pnp
.pnp.js
.yarn/install-state.gz

# testing
/coverage

# next.js
/.next/
/out/

# production
/build

# misc
.DS_Store
*.pem

# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# local env files
.env.local

# vercel
.vercel

# typescript
*.tsbuildinfo
next-env.d.ts
12 changes: 12 additions & 0 deletions apps/nodejs/web/.prettierrc.json
@@ -0,0 +1,12 @@
{
  "tabWidth": 2,
  "useTabs": false,
  "semi": false,
  "singleQuote": true,
  "trailingComma": "es5",
  "bracketSpacing": true,
  "bracketSameLine": false,
  "arrowParens": "always",
  "endOfLine": "lf",
  "printWidth": 100
}
28 changes: 28 additions & 0 deletions apps/nodejs/web/README.md
@@ -0,0 +1,28 @@
This is a [Next.js](https://nextjs.org/) project.

## Getting Started

Add a .env.local file with:

- API_ENDPOINT_URL: the URL from which the table data is fetched. For example:
  http://localhost:3001/benchmark_data
- SHOW_STDERR: set this to "true" to show the standard deviation
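
For reference, a minimal .env.local might look like this (the values are only examples; point API_ENDPOINT_URL at wherever your benchmark data is served):

```bash
API_ENDPOINT_URL=http://localhost:3001/benchmark_data
SHOW_STDERR=true
```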

Run the server in development mode with:

```bash
pnpm dev
```

Build the server for production with:

```bash
pnpm build
```

Start the production build with:

```bash
pnpm start
```

47 changes: 47 additions & 0 deletions apps/nodejs/web/app/AppBar/AppBar.tsx
@@ -0,0 +1,47 @@
import MUIAppBar from '@mui/material/AppBar'
import Toolbar from '@mui/material/Toolbar'
import Stack from '@mui/material/Stack'
import React from 'react'
import Logo from '../assets/logo/logo.svg'
import ToggleThemeButton from '@/app/AppBar/ToggleTheme'

const height = '60px!important'
export default function AppBar() {
  return (
    <>
      <MUIAppBar
        elevation={0}
        sx={{
          height,
          paddingX: 1,
        }}
      >
        <Toolbar
          sx={{
            height,
            minHeight: height,
            width: '100%',
            alignItems: 'center',
            justifyContent: 'space-between',
            paddingX: { xs: '5px!important', sm: '10px!important', xl: '20px!important' },
          }}
        >
          <Stack
            marginLeft={-1}
            alignItems={'center'}
            justifyContent={'center'}
            height={{ xs: 30, sm: 40, md: 50 }}
            width={{ xs: 70, sm: 100, md: 120 }}
          >
            <Logo className={'logo'} viewBox={'0 0 200 84'} />
          </Stack>
          <ToggleThemeButton />
        </Toolbar>
      </MUIAppBar>
      {/* Spacer toolbar: offsets the fixed AppBar so page content is not hidden behind it */}
      <Toolbar
        sx={{
          height,
          minHeight: height,
        }}
      />
    </>
  )
}
32 changes: 32 additions & 0 deletions apps/nodejs/web/app/AppBar/ToggleTheme.tsx
@@ -0,0 +1,32 @@
'use client'

import { useTheme } from '@mui/material'
import IconButton from '@mui/material/IconButton'
import DarkThemeIcon from './assets/icons/dark_icon_theme.svg'
import LightThemeIcon from './assets/icons/light_icon_theme.svg'

export default function ToggleThemeButton() {
  const isLight = useTheme().palette.mode === 'light'

  const toggleTheme = () => {
    // Broadcast the requested theme; a theme provider elsewhere in the app
    // presumably listens for this event and updates the MUI theme.
    const newTheme = isLight ? 'dark' : 'light'
    window.dispatchEvent(new CustomEvent('themeChanged', { detail: newTheme }))
  }

  return (
    <IconButton
      sx={{
        padding: 0,
        height: 40,
        width: 40,
      }}
      onClick={toggleTheme}
    >
      {isLight ? <DarkThemeIcon viewBox="0 0 40 40" /> : <LightThemeIcon viewBox="0 0 40 40" />}
    </IconButton>
  )
}
4 changes: 4 additions & 0 deletions apps/nodejs/web/app/AppBar/assets/icons/dark_icon_theme.svg
15 changes: 15 additions & 0 deletions apps/nodejs/web/app/AppBar/assets/icons/light_icon_theme.svg
14 changes: 14 additions & 0 deletions apps/nodejs/web/app/Home/API.tsx
@@ -0,0 +1,14 @@
import Typography from '@mui/material/Typography'

export default function API() {
  return (
    <>
      <Typography variant={'h1'} fontSize={26} fontWeight={500}>API</Typography>
      <Typography fontSize={14} marginTop={1}>
        Want to use this data live to feed your App or POKT Network portal?
        <br />
        We've got you covered: this will be deployed with an API to query the inference nodes'
        performance.
      </Typography>
    </>
  )
}
133 changes: 133 additions & 0 deletions apps/nodejs/web/app/Home/About.tsx
@@ -0,0 +1,133 @@
import Typography from '@mui/material/Typography'
import Stack from '@mui/material/Stack'

export default function About() {
  return (
    <Stack
      sx={{
        '& p': {
          marginTop: 1,
        },
        'p, a': {
          fontSize: 14,
        },
        '& h2, h3': {
          marginTop: 2,
        },
        '& a': {
          textDecoration: 'none',
          color: '#4379ff',
          fontWeight: 600,
          '&:hover': {
            textDecoration: 'underline',
          },
        },
      }}
    >
      <Typography variant={'h1'} fontSize={26} fontWeight={500}>About</Typography>
      <Typography>
        Evaluating a language model (LM) is a complex task that involves analyzing many different
        aspects of its capabilities: from recall to solving math problems. An effort to simplify
        these tasks was the creation of leaderboards such as the{' '}
        <a
          href={'https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard'}
          target={'_blank'}
        >
          Open LLM Leaderboard
        </a> by HuggingFace.
      </Typography>
      <Typography>
        This leaderboard is an effort to provide POKT Network users with the same information that
        they are used to looking at when choosing an open LLM, but with the following advantages:
      </Typography>
      <ul>
        <li>
          <Typography>
            <strong>What you see is what you get:</strong> You are not looking at model names; you
            are looking at actual inference endpoints.
          </Typography>
        </li>
        <li>
          <Typography>
            <strong>Live Data:</strong> We run these tests 24/7; the scores are updated each time
            the inference node enters a session.
          </Typography>
        </li>
        <li>
          <Typography>
            <strong>Trustless and Permissionless:</strong> If you connect your model to the POKT
            Network, we will track it; we don't care who is behind the node or what they claim
            about it. We test and report, period.
          </Typography>
        </li>
      </ul>
      <Typography variant={'h3'} fontSize={20} fontWeight={500}>Tasks</Typography>
      <Typography>
        Following Hugging Face's Open LLM Leaderboard, we evaluate models on 6 key benchmarks using
        our Machine Learning Test Bench, which implements the Eleuther AI Language Model Evaluation
        Harness under the hood. The implemented tasks are:
      </Typography>
      <ul>
        <li>
          <Typography>
            <a href={'https://arxiv.org/abs/1803.05457'} target={'_blank'}>AI2 Reasoning Challenge</a> (25-shot)
            - a set of grade-school science questions.
          </Typography>
        </li>
        <li>
          <Typography>
            <a href={'https://arxiv.org/abs/1905.07830'} target={'_blank'}>HellaSwag</a> (10-shot) - a test of
            commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
          </Typography>
        </li>
        <li>
          <Typography>
            <a href={'https://arxiv.org/abs/2009.03300'} target={'_blank'}>MMLU</a> (5-shot) - a test measuring
            a text model's multitask accuracy. The test covers 57 tasks, including elementary
            mathematics, US history, computer science, law, and more.
          </Typography>
        </li>
        <li>
          <Typography>
            <a href={'https://arxiv.org/abs/2109.07958'} target={'_blank'}>TruthfulQA</a> (0-shot) - a test
            measuring a model's propensity to reproduce falsehoods commonly found online. Note:
            TruthfulQA is technically a 6-shot task in the Harness because each example is
            prepended with 6 Q/A pairs, even in the 0-shot setting.
          </Typography>
        </li>
        <li>
          <Typography>
            <a href={'https://arxiv.org/abs/1907.10641'} target={'_blank'}>Winogrande</a> (5-shot) - an
            adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
          </Typography>
        </li>
        <li>
          <Typography>
            <a href={'https://arxiv.org/abs/2110.14168'} target={'_blank'}>GSM8k</a> (5-shot) - diverse grade
            school math word problems measuring a model's ability to solve multi-step mathematical
            reasoning problems (only partial match).
          </Typography>
        </li>
      </ul>
      <Typography>
        Since we are dealing with live endpoints, we do not run the full datasets; instead, we draw
        50 samples for each task (or sub-task in the case of MMLU). The effect on the tests'
        accuracy is less than <code>5%</code> (
        <a href={'http://arxiv.org/abs/2402.14992'} target={'_blank'}>tinyBenchmarks, Polo et al.</a>).
      </Typography>

      <Typography>Remember, a higher score is a better score!</Typography>

      <Typography variant={'h3'} fontSize={20} fontWeight={500}>Reproducibility</Typography>
      <Typography>
        The code used to produce these results is fully available in our repository,{' '}
        <a href={'https://github.com/pokt-scan/pocket-ml-testbench'} target={'_blank'}>Machine Learning Test Bench</a>.
        Remember that the results will not be numerically exact, since they depend on a random
        sample of the complete dataset, and the performance of the available nodes may vary over
        time.
        <br />
        You will also need a POKT Network App to connect to the network and consume relays; this
        has a cost, paid in POKT.
      </Typography>
    </Stack>
  )
}