<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<!-- Meta tags for social media banners; these should be filled in appropriately, as they are your "business card" -->
<!-- Replace the content tag with appropriate information -->
<meta name="description" content="Radiology Report Generation Metric">
<meta property="og:title" content="GREEN"/>
<meta property="og:description" content="GREEN: Generative Radiology Report Evaluation and Error Notation"/>
<meta property="og:url" content="URL OF THE WEBSITE"/>
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x630 -->
<meta property="og:image" content=""/>
<meta property="og:image:width" content="1200"/>
<meta property="og:image:height" content="630"/>
<meta name="twitter:title" content="GREEN">
<meta name="twitter:description" content="GREEN: Generative Radiology Report Evaluation and Error Notation">
<!-- Path to banner image, should be in the path listed below. Optimal dimensions are 1200x600 -->
<meta name="twitter:card" content="summary_large_image">
<!-- Keywords for your paper to be indexed by-->
<meta name="keywords" content="Radiology Report Generation, LLM, Evaluation">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>GREEN: Generative Radiology Report Evaluation and Error Notation</title>
<link rel="icon" type="image/x-icon" href="src/static-chexagent/images/favicon.png">
<link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
<link rel="stylesheet" href="src/static-chexagent/css/bulma.min.css">
<link rel="stylesheet" href="src/static-chexagent/css/bulma-carousel.min.css">
<link rel="stylesheet" href="src/static-chexagent/css/bulma-slider.min.css">
<link rel="stylesheet" href="src/static-chexagent/css/fontawesome.all.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
<link rel="stylesheet" href="src/static-chexagent/css/index.css">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
<script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
<script defer src="src/static-chexagent/js/fontawesome.all.min.js"></script>
<script src="src/static-chexagent/js/bulma-carousel.min.js"></script>
<script src="src/static-chexagent/js/bulma-slider.min.js"></script>
<script src="src/static-chexagent/js/index.js"></script>
</head>
<body>
<section class="hero">
<div class="hero-body">
<div class="container is-max-desktop">
<div class="columns is-centered">
<div class="column has-text-centered">
<h1 class="title is-1 publication-title">GREEN: Generative Radiology Report Evaluation and Error Notation</h1>
<div class="is-size-5 publication-authors">
<!-- Paper authors -->
<span class="author-block"><a href="https://scholar.google.com/citations?user=Q2SCA-sAAAAJ&hl=en" target="_blank">Sophie Ostmeier</a><sup></sup>,</span>
<span class="author-block"><a href="https://github.com/justin13601" target="_blank">Justin Xu</a><sup></sup>,</span>
<span class="author-block"><a href="https://zhjohnchan.github.io/" target="_blank">Zhihong Chen</a><sup></sup>,</span>
<span class="author-block"><a href="https://maya-varma.com/" target="_blank">Maya Varma</a><sup></sup>,</span>
<span class="author-block"><a href="https://scholar.google.ch/citations?user=ao8Yld4AAAAJ&hl=de" target="_blank">Christian Bluethgen</a>,</span>
<span class="author-block"><a href="https://www.linkedin.com/in/arne-michalson-2b3aa2271/" target="_blank">Arne Edward Michalson</a>,</span>
<span class="author-block"><a href="https://profiles.stanford.edu/michael-moseley" target="_blank">Michael Moseley</a>,</span>
<span class="author-block"><a href="https://profiles.stanford.edu/curtis-langlotz" target="_blank">Curtis Langlotz</a></span>
<span class="author-block"><a href="https://profiles.stanford.edu/akshay-chaudhari" target="_blank">Akshay S. Chaudhari</a><sup>*</sup>,</span>
<span class="author-block"><a href="https://jbdel.github.io/" target="_blank">Jean-Benoit Delbrouck</a><sup>*</sup>,</span>
</div>
<div class="is-size-5 publication-authors">
<span class="author-block">Stanford University, <br>May 2024</span>
<span class="eql-cntrb"><small><br><sup>*</sup>Equal Contribution</small></span>
</div>
<div class="column has-text-centered">
<div class="publication-links">
<!-- ArXiv abstract Link -->
<span class="link-block">
<a href="https://arxiv.org/pdf/2405.03595" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="ai ai-arxiv"></i>
</span>
<span>arXiv</span>
</a>
</span>
<!-- Github link -->
<span class="link-block">
<a href="https://github.com/Stanford-AIMI/GREEN" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fab fa-github"></i>
</span>
<span>Code</span>
</a>
</span>
<!-- HuggingFace link -->
<span class="link-block">
<a href="https://huggingface.co/StanfordAIMI/GREEN-radllama2-7b" target="_blank"
class="external-link button is-normal is-rounded is-dark">
<span class="icon">
<i class="fas fa-smile"></i>
</span>
<span>HuggingFace</span>
</a>
</span>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- Paper abstract -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Evaluating radiology reports is a challenging problem, as factual correctness is extremely important due to their medical nature. Existing automatic evaluation metrics either fail to consider factual correctness (e.g., BLEU and ROUGE) or are limited in their interpretability (e.g., F1CheXpert and F1RadGraph). In this paper, we introduce GREEN (Generative Radiology Report Evaluation and Error Notation), a radiology report generation metric that leverages the natural language understanding of language models to identify and explain clinically significant errors in candidate reports, both quantitatively and qualitatively. Compared to current metrics, GREEN offers a score aligned with expert preferences, human-interpretable explanations of clinically significant errors that enable feedback loops with end-users, and a lightweight open-source method that reaches the performance of commercial counterparts. We validate GREEN by comparing it to GPT-4 as well as to the error counts of 6 experts and the preferences of 2 experts. Our method demonstrates not only a higher correlation with expert error counts but also higher alignment with expert preferences than previous approaches.</p>
<figure>
<img src="src/static-green/images/motivation.png" alt="Overview" class="center-image blend-img-background">
</figure>
</div>
</div>
</div>
</div>
</section>
<!-- End paper abstract -->
<!-- Metric -->
<script type="module" src="https://gradio.s3-us-west-2.amazonaws.com/4.14.0/gradio.js"></script>
<section class="section hero">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Metric</h2>
<div class="content has-text-justified">
<p>
We introduce a novel metric named GREEN (Generative Radiology Report Evaluation and Error Notation), designed to assess the quality of radiology reports produced by machine learning models. Leveraging the advanced natural language understanding capabilities of large language models, GREEN accurately identifies and explains clinically significant discrepancies between reference and generated reports. The metric produces<br><br>
1) a score ranging from 0 to 1 for quantitative analysis, and<br>
2) a summary of the identified errors for qualitative analysis.<br><br>
This interpretable output makes GREEN a tool for providing feedback to end-users and for improving the quality of automated radiology reporting.
</p>
<figure>
<img src="src/static-green/images/summary.png" alt="CheXinstruct" class="center-image blend-img-background">
</figure>
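<p>
For illustration, the snippet below is a minimal sketch of querying the released checkpoint directly with the Hugging Face <code>transformers</code> library. The prompt template shown is illustrative only, not the exact template used in training; the official wrapper in the linked GitHub repository handles prompting and score parsing.
</p>
<pre><code># Minimal sketch: querying the released GREEN model with transformers.
# The prompt below is illustrative; see https://github.com/Stanford-AIMI/GREEN
# for the official scoring wrapper.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "StanfordAIMI/GREEN-radllama2-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

reference = "No acute cardiopulmonary process."
candidate = "Mild cardiomegaly. No focal consolidation."

prompt = (
    "Evaluate the candidate radiology report against the reference report "
    "and list any clinically significant errors.\n"
    f"Reference: {reference}\n"
    f"Candidate: {candidate}\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=512)
print(tokenizer.decode(output[0], skip_special_tokens=True))
</code></pre>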
</div>
</div>
</div>
</div>
</section>
<!-- End Metric -->
<!-- Development -->
<section class="section hero is-light">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Development</h2>
<div class="content has-text-justified">
<p>
To build our training dataset, we compiled 100,000 pairs of reference and generated candidate radiology reports from six chest X-ray datasets: MIMIC-CXR, MIMIC-PRO, CandidPTX, PadChest, BIMCV-COVID19, and OpenI. Candidate generation was guided by GPT-4 to highlight differences in predefined clinical categories. Pairing strategies ranged from random matching to semantic similarity and RadGraph permutations (see the sketch below), yielding 174,329 unique reports and ensuring diversity. We further pre-trained our models on medical text datasets, including MIMIC-IV Radiology and Discharge Summaries, MIMIC-CXR reports, PubMed content, Wiki Medical Terms, and Medical Guidelines. We then trained a variety of open-source large language models on this training set, outperforming previous approaches both quantitatively and qualitatively.
</p>
<figure>
<img src="src/static-green/images/training.png" alt="CheXbench" class="center-image blend-img-background">
</figure>
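<p>
As a toy illustration of the semantic-similarity pairing strategy mentioned above (not the authors' actual pipeline, and with a hypothetical choice of sentence encoder), one could embed reports and pair each with its nearest distinct neighbor:
</p>
<pre><code># Hypothetical sketch of pairing reports by semantic similarity.
from sentence_transformers import SentenceTransformer, util

reports = [
    "No acute cardiopulmonary process.",
    "Stable mild cardiomegaly without pulmonary edema.",
    "Right lower lobe consolidation concerning for pneumonia.",
]

encoder = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative encoder choice
embeddings = encoder.encode(reports, convert_to_tensor=True)
similarity = util.cos_sim(embeddings, embeddings)  # pairwise cosine similarity

# Pair each report with its most similar, distinct counterpart.
for i, row in enumerate(similarity):
    row[i] = -1.0  # exclude self-matches
    j = int(row.argmax())
    print(f"reference {i} paired with candidate {j} (cos = {row[j].item():.2f})")
</code></pre>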
</div>
</div>
</div>
</div>
</section>
<!-- End Development -->
<!-- Validation -->
<section class="section hero">
<div class="container is-max-desktop">
<div class="columns is-centered has-text-centered">
<div class="column is-four-fifths">
<h2 class="title is-3">Validation</h2>
<div class="content has-text-justified">
<p>
Our validation shows that the GREEN score closely approximates the error assessment of an average radiologist, with an error difference of 1.54, nearly matching GPT-4's performance. GREEN thereby also approaches the average inter-expert difference, indicating high fidelity in error evaluation. A comparative analysis of metrics revealed that both versions of GREEN exhibit stronger correlations with total radiologist error counts than conventional metrics.
</p>
<figure>
<img src="src/static-green/images/eval.png" alt="CheXbench" class="center-image blend-img-background">
</figure>
<p>
Additionally, GREEN's error counts achieved a correlation coefficient of 0.79 with radiologist error counts, outperforming other metrics, including GPT-4-based ones (a toy version of this analysis is sketched below). This robust correlation, along with high expert preference alignment, highlights GREEN's efficacy and potential as an interpretable tool for medical report evaluation.
</p>
<figure>
<img src="src/static-green/images/preferences.png" alt="CheXbench" class="center-image blend-img-background">
</figure>
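<p>
As a toy sketch of this kind of correlation analysis (with made-up numbers, not the paper's data or evaluation code), one could compare per-report expert error counts with a metric's error counts using rank correlations from SciPy:
</p>
<pre><code># Toy sketch: rank correlation between expert and metric error counts.
from scipy.stats import kendalltau, spearmanr

expert_error_counts = [0, 1, 3, 2, 5, 4]  # hypothetical expert annotations
metric_error_counts = [0, 1, 2, 2, 6, 4]  # hypothetical metric outputs

rho, p_spearman = spearmanr(expert_error_counts, metric_error_counts)
tau, p_kendall = kendalltau(expert_error_counts, metric_error_counts)
print(f"Spearman rho = {rho:.2f} (p = {p_spearman:.3f})")
print(f"Kendall tau = {tau:.2f} (p = {p_kendall:.3f})")
</code></pre>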
</div>
</div>
</div>
</div>
</section>
<!-- End Validation -->
<!--BibTex citation -->
<section class="section" id="BibTeX">
<div class="container is-max-desktop content">
<h2 class="title">BibTeX</h2>
<pre><code>@article{ostmeier2024green,
title={GREEN: Generative Radiology Report Evaluation and Error Notation},
author={Ostmeier, Sophie and Xu, Justin and Chen, Zhihong and Varma, Maya and Blankemeier, Louis and Bluethgen, Christian and Michalson, Arne Edward and Moseley, Michael and Langlotz, Curtis and Chaudhari, Akshay S and others},
journal={arXiv preprint arXiv:2405.03595},
year={2024}
}
</code></pre>
</div>
</section>
<!--End BibTex citation -->
<footer class="footer">
<div class="container">
<div class="columns is-centered">
<div class="column is-8">
<div class="content">
<p>
Last update: 2024/05. Template credited to <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank">Academic Project Page Template</a>.
</p>
</div>
</div>
</div>
</div>
</footer>
</body>
</html>